From 3871f2ffe53db3cef4fe0c18993ad9e6e0f69408 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 24 Dec 2008 16:06:57 -0800 Subject: sysrq: fix ftrace help msg & doc. Impact: update documentation and help messages We have a conventional method of explicitly stating the sysrq action key in a sysrq help message, so change dump-ftrace-buffer to use that method and add it to Documentation/sysrq.txt. Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar --- Documentation/sysrq.txt | 2 ++ drivers/char/sysrq.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 10a0263ebb3f..56b53e005d18 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -114,6 +114,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'x' - Used by xmon interface on ppc/powerpc platforms. +'z' - Dump the ftrace buffer + '0'-'9' - Sets the console log level, controlling which kernel messages will be printed to your console. ('0', for example would make it so that only emergency messages like PANICs or OOPSes would diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 94966edfb44d..785a08ef9a13 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -283,7 +283,7 @@ static void sysrq_ftrace_dump(int key, struct tty_struct *tty) } static struct sysrq_key_op sysrq_ftrace_dump_op = { .handler = sysrq_ftrace_dump, - .help_msg = "dumpZ-ftrace-buffer", + .help_msg = "dump-ftrace-buffer(Z)", .action_msg = "Dump ftrace buffer", .enable_mask = SYSRQ_ENABLE_DUMP, }; -- cgit v1.2.3-59-g8ed1b From 412d0bb553c0227191f1bfd06100f561600bff22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Dec 2008 01:43:25 +0100 Subject: tracing/function-graph-tracer: strip ending newlines on comments Impact: tracer output improvement Ending newlines are appended automatically on comments by the function graph tracer because the newline needs to be placed after the "*/" comment characters. So if the user puts an ending newline, we want to strip it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4bf39fcae97a..bc7d90850be5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -592,6 +592,12 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, if (ent->flags & TRACE_FLAG_CONT) trace_seq_print_cont(s, iter); + /* Strip ending newline */ + if (s->buffer[s->len - 1] == '\n') { + s->buffer[s->len - 1] = '\0'; + s->len--; + } + ret = trace_seq_printf(s, " */\n"); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From c47956d9ae3341d2d1998bff26620fa3338c01e4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 23:24:11 -0500 Subject: ftrace: remove obsolete print continue functionality Impact: cleanup, remove obsolete code Now that the ring buffer used by ftrace allows for variable length entries, we do not need the 'cont' feature of the buffer. This code makes other parts of ftrace more complex and by removing this it simplifies the ftrace code. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 61 ------------------------------------ kernel/trace/trace.h | 7 ----- kernel/trace/trace_functions_graph.c | 3 -- kernel/trace/trace_mmiotrace.c | 3 -- 4 files changed, 74 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f4bb3800318b..fca0233f1d73 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1765,43 +1765,6 @@ static int task_state_char(unsigned long state) return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; } -/* - * The message is supposed to contain an ending newline. - * If the printing stops prematurely, try to add a newline of our own. - */ -void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) -{ - struct trace_entry *ent; - struct trace_field_cont *cont; - bool ok = true; - - ent = peek_next_entry(iter, iter->cpu, NULL); - if (!ent || ent->type != TRACE_CONT) { - trace_seq_putc(s, '\n'); - return; - } - - do { - cont = (struct trace_field_cont *)ent; - if (ok) - ok = (trace_seq_printf(s, "%s", cont->buf) > 0); - - ftrace_disable_cpu(); - - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - else - ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); - - ftrace_enable_cpu(); - - ent = peek_next_entry(iter, iter->cpu, NULL); - } while (ent && ent->type == TRACE_CONT); - - if (!ok) - trace_seq_putc(s, '\n'); -} - static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1834,9 +1797,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) int S, T; int i; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - test_cpu_buff_start(iter); next_entry = find_next_entry(iter, NULL, &next_ts); @@ -1922,8 +1882,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_printf(s, ": %s", field->buf); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); break; } case TRACE_BRANCH: { @@ -1968,9 +1926,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - test_cpu_buff_start(iter); comm = trace_find_cmdline(iter->ent->pid); @@ -2076,8 +2031,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_printf(s, ": %s", field->buf); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); break; } case TRACE_GRAPH_RET: { @@ -2124,9 +2077,6 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - ret = trace_seq_printf(s, "%d %d %llu ", entry->pid, iter->cpu, iter->ts); if (!ret) @@ -2187,8 +2137,6 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) trace_assign_type(field, entry); trace_seq_printf(s, "# %lx %s", field->ip, field->buf); - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); break; } } @@ -2217,9 +2165,6 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); SEQ_PUT_HEX_FIELD_RET(s, iter->ts); @@ -2283,9 +2228,6 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - return TRACE_TYPE_HANDLED; } @@ -2296,9 +2238,6 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; - if (entry->type == TRACE_CONT) - return TRACE_TYPE_HANDLED; - SEQ_PUT_FIELD_RET(s, entry->pid); SEQ_PUT_FIELD_RET(s, entry->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc7a4f864036..3a357382cce6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -16,7 +16,6 @@ enum trace_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, - TRACE_CONT, TRACE_STACK, TRACE_PRINT, TRACE_SPECIAL, @@ -178,7 +177,6 @@ struct trace_power { * NEED_RESCED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler - * CONT - multiple entries hold the trace item */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, @@ -186,7 +184,6 @@ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, - TRACE_FLAG_CONT = 0x20, }; #define TRACE_BUF_SIZE 1024 @@ -262,7 +259,6 @@ extern void __ftrace_bad_type(void); do { \ IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ - IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ @@ -489,9 +485,6 @@ extern int trace_selftest_startup_branch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); -extern void trace_seq_print_cont(struct trace_seq *s, - struct trace_iterator *iter); - extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index bc7d90850be5..f261966e5b6c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -589,9 +589,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (ent->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - /* Strip ending newline */ if (s->buffer[s->len - 1] == '\n') { s->buffer[s->len - 1] = '\0'; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fffcb069f1dc..83f20ae6bd68 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -262,9 +262,6 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - if (entry->flags & TRACE_FLAG_CONT) - trace_seq_print_cont(s, iter); - return TRACE_TYPE_HANDLED; } -- cgit v1.2.3-59-g8ed1b From f0868d1e23a8efec33beb3aa688aab7fdb1ae093 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 23:24:12 -0500 Subject: ftrace: set up trace event hash infrastructure Impact: simplify/generalize/refactor trace.c The trace.c file is becoming more difficult to maintain due to the growing number of events. There is several formats that an event may be printed. This patch sets up the infrastructure of an event hash to allow for events to register how they should be printed. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Makefile | 1 + kernel/trace/trace.c | 275 +------------------------- kernel/trace/trace.h | 8 +- kernel/trace/trace_boot.c | 1 + kernel/trace/trace_functions_graph.c | 1 + kernel/trace/trace_hw_branches.c | 1 + kernel/trace/trace_mmiotrace.c | 1 + kernel/trace/trace_output.c | 365 +++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.h | 43 +++++ kernel/trace/trace_power.c | 1 + 10 files changed, 416 insertions(+), 281 deletions(-) create mode 100644 kernel/trace/trace_output.c create mode 100644 kernel/trace/trace_output.h diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 349d5a93653f..549f93c9b393 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fca0233f1d73..90ce0c1d437b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -38,6 +38,7 @@ #include #include "trace.h" +#include "trace_output.h" #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) @@ -330,132 +331,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } -/** - * trace_seq_printf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_printf(struct trace_seq *s, const char *fmt, ...) -{ - int len = (PAGE_SIZE - 1) - s->len; - va_list ap; - int ret; - - if (!len) - return 0; - - va_start(ap, fmt); - ret = vsnprintf(s->buffer + s->len, len, fmt, ap); - va_end(ap); - - /* If we can't write it all, don't bother writing anything */ - if (ret >= len) - return 0; - - s->len += ret; - - return len; -} - -/** - * trace_seq_puts - trace sequence printing of simple string - * @s: trace sequence descriptor - * @str: simple string to record - * - * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple string - * into a special buffer (@s) for later retrieval by a sequencer - * or other mechanism. - */ -static int -trace_seq_puts(struct trace_seq *s, const char *str) -{ - int len = strlen(str); - - if (len > ((PAGE_SIZE - 1) - s->len)) - return 0; - - memcpy(s->buffer + s->len, str, len); - s->len += len; - - return len; -} - -static int -trace_seq_putc(struct trace_seq *s, unsigned char c) -{ - if (s->len >= (PAGE_SIZE - 1)) - return 0; - - s->buffer[s->len++] = c; - - return 1; -} - -static int -trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) -{ - if (len > ((PAGE_SIZE - 1) - s->len)) - return 0; - - memcpy(s->buffer + s->len, mem, len); - s->len += len; - - return len; -} - -#define MAX_MEMHEX_BYTES 8 -#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) - -static int -trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) -{ - unsigned char hex[HEX_CHARS]; - unsigned char *data = mem; - int i, j; - -#ifdef __BIG_ENDIAN - for (i = 0, j = 0; i < len; i++) { -#else - for (i = len-1, j = 0; i >= 0; i--) { -#endif - hex[j++] = hex_asc_hi(data[i]); - hex[j++] = hex_asc_lo(data[i]); - } - hex[j++] = ' '; - - return trace_seq_putmem(s, hex, j); -} - -static int -trace_seq_path(struct trace_seq *s, struct path *path) -{ - unsigned char *p; - - if (s->len >= (PAGE_SIZE - 1)) - return 0; - p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); - if (!IS_ERR(p)) { - p = mangle_path(s->buffer + s->len, p, "\n"); - if (p) { - s->len = p - s->buffer; - return 1; - } - } else { - s->buffer[s->len++] = '?'; - return 1; - } - - return 0; -} - static void trace_seq_reset(struct trace_seq *s) { @@ -1473,154 +1348,6 @@ static void s_stop(struct seq_file *m, void *p) mutex_unlock(&trace_types_lock); } -#ifdef CONFIG_KRETPROBES -static inline const char *kretprobed(const char *name) -{ - static const char tramp_name[] = "kretprobe_trampoline"; - int size = sizeof(tramp_name); - - if (strncmp(tramp_name, name, size) == 0) - return "[unknown/kretprobe'd]"; - return name; -} -#else -static inline const char *kretprobed(const char *name) -{ - return name; -} -#endif /* CONFIG_KRETPROBES */ - -static int -seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - const char *name; - - kallsyms_lookup(address, NULL, NULL, NULL, str); - - name = kretprobed(str); - - return trace_seq_printf(s, fmt, name); -#endif - return 1; -} - -static int -seq_print_sym_offset(struct trace_seq *s, const char *fmt, - unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - char str[KSYM_SYMBOL_LEN]; - const char *name; - - sprint_symbol(str, address); - name = kretprobed(str); - - return trace_seq_printf(s, fmt, name); -#endif - return 1; -} - -#ifndef CONFIG_64BIT -# define IP_FMT "%08lx" -#else -# define IP_FMT "%016lx" -#endif - -int -seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) -{ - int ret; - - if (!ip) - return trace_seq_printf(s, "0"); - - if (sym_flags & TRACE_ITER_SYM_OFFSET) - ret = seq_print_sym_offset(s, "%s", ip); - else - ret = seq_print_sym_short(s, "%s", ip); - - if (!ret) - return 0; - - if (sym_flags & TRACE_ITER_SYM_ADDR) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; -} - -static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, - unsigned long ip, unsigned long sym_flags) -{ - struct file *file = NULL; - unsigned long vmstart = 0; - int ret = 1; - - if (mm) { - const struct vm_area_struct *vma; - - down_read(&mm->mmap_sem); - vma = find_vma(mm, ip); - if (vma) { - file = vma->vm_file; - vmstart = vma->vm_start; - } - if (file) { - ret = trace_seq_path(s, &file->f_path); - if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart); - } - up_read(&mm->mmap_sem); - } - if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; -} - -static int -seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, - unsigned long sym_flags) -{ - struct mm_struct *mm = NULL; - int ret = 1; - unsigned int i; - - if (trace_flags & TRACE_ITER_SYM_USEROBJ) { - struct task_struct *task; - /* - * we do the lookup on the thread group leader, - * since individual threads might have already quit! - */ - rcu_read_lock(); - task = find_task_by_vpid(entry->ent.tgid); - if (task) - mm = get_task_mm(task); - rcu_read_unlock(); - } - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - unsigned long ip = entry->caller[i]; - - if (ip == ULONG_MAX || !ret) - break; - if (i && ret) - ret = trace_seq_puts(s, " <- "); - if (!ip) { - if (ret) - ret = trace_seq_puts(s, "??"); - continue; - } - if (!ret) - break; - if (ret) - ret = seq_print_user_ip(s, mm, ip, sym_flags); - } - - if (mm) - mmput(mm); - return ret; -} - static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3a357382cce6..6bd71fa1e1c7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -30,7 +30,7 @@ enum trace_type { TRACE_HW_BRANCHES, TRACE_POWER, - __TRACE_LAST_TYPE + __TRACE_LAST_TYPE, }; /* @@ -484,12 +484,6 @@ extern int trace_selftest_startup_branch(struct tracer *trace, #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); -extern int -seq_print_ip_sym(struct trace_seq *s, unsigned long ip, - unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, - size_t cnt); extern long ns2usecs(cycle_t nsec); extern int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 3ccebde28482..cb2ff3e297b1 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -11,6 +11,7 @@ #include #include "trace.h" +#include "trace_output.h" static struct trace_array *boot_trace; static bool pre_initcalls_finished; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f261966e5b6c..f8ac5417afc8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -12,6 +12,7 @@ #include #include "trace.h" +#include "trace_output.h" #define TRACE_GRAPH_INDENT 2 diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index b6a3e20a49a9..879752b006b3 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -14,6 +14,7 @@ #include #include "trace.h" +#include "trace_output.h" #define SIZEOF_BTS (1 << 13) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 83f20ae6bd68..fcec59ff2355 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -11,6 +11,7 @@ #include #include "trace.h" +#include "trace_output.h" struct header_iter { struct pci_dev *dev; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c new file mode 100644 index 000000000000..1f3f80002b5e --- /dev/null +++ b/kernel/trace/trace_output.c @@ -0,0 +1,365 @@ +/* + * trace_output.c + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt + * + */ + +#include +#include +#include + +#include "trace_output.h" + +/* must be a power of 2 */ +#define EVENT_HASHSIZE 128 + +static DEFINE_MUTEX(trace_event_mutex); +static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; + +static int next_event_type = __TRACE_LAST_TYPE + 1; + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +int +trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +{ + int len = (PAGE_SIZE - 1) - s->len; + va_list ap; + int ret; + + if (!len) + return 0; + + va_start(ap, fmt); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + */ +int trace_seq_puts(struct trace_seq *s, const char *str) +{ + int len = strlen(str); + + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} + +int trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->len >= (PAGE_SIZE - 1)) + return 0; + + s->buffer[s->len++] = c; + + return 1; +} + +int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) +{ + if (len > ((PAGE_SIZE - 1) - s->len)) + return 0; + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} + +int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) +{ + unsigned char hex[HEX_CHARS]; + unsigned char *data = mem; + int i, j; + +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < len; i++) { +#else + for (i = len-1, j = 0; i >= 0; i--) { +#endif + hex[j++] = hex_asc_hi(data[i]); + hex[j++] = hex_asc_lo(data[i]); + } + hex[j++] = ' '; + + return trace_seq_putmem(s, hex, j); +} + +int trace_seq_path(struct trace_seq *s, struct path *path) +{ + unsigned char *p; + + if (s->len >= (PAGE_SIZE - 1)) + return 0; + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { + p = mangle_path(s->buffer + s->len, p, "\n"); + if (p) { + s->len = p - s->buffer; + return 1; + } + } else { + s->buffer[s->len++] = '?'; + return 1; + } + + return 0; +} + +#ifdef CONFIG_KRETPROBES +static inline const char *kretprobed(const char *name) +{ + static const char tramp_name[] = "kretprobe_trampoline"; + int size = sizeof(tramp_name); + + if (strncmp(tramp_name, name, size) == 0) + return "[unknown/kretprobe'd]"; + return name; +} +#else +static inline const char *kretprobed(const char *name) +{ + return name; +} +#endif /* CONFIG_KRETPROBES */ + +static int +seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); +#endif + return 1; +} + +static int +seq_print_sym_offset(struct trace_seq *s, const char *fmt, + unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + sprint_symbol(str, address); + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); +#endif + return 1; +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) +{ + struct file *file = NULL; + unsigned long vmstart = 0; + int ret = 1; + + if (mm) { + const struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma) { + file = vma->vm_file; + vmstart = vma->vm_start; + } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + ret = trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); + } + up_read(&mm->mmap_sem); + } + if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +int +seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, + unsigned long sym_flags) +{ + struct mm_struct *mm = NULL; + int ret = 1; + unsigned int i; + + if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(entry->ent.tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = entry->caller[i]; + + if (ip == ULONG_MAX || !ret) + break; + if (i && ret) + ret = trace_seq_puts(s, " <- "); + if (!ip) { + if (ret) + ret = trace_seq_puts(s, "??"); + continue; + } + if (!ret) + break; + if (ret) + ret = seq_print_user_ip(s, mm, ip, sym_flags); + } + + if (mm) + mmput(mm); + return ret; +} + +int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) +{ + int ret; + + if (!ip) + return trace_seq_printf(s, "0"); + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + ret = seq_print_sym_offset(s, "%s", ip); + else + ret = seq_print_sym_short(s, "%s", ip); + + if (!ret) + return 0; + + if (sym_flags & TRACE_ITER_SYM_ADDR) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +/** + * ftrace_find_event - find a registered event + * @type: the type of event to look for + * + * Returns an event of type @type otherwise NULL + */ +struct trace_event *ftrace_find_event(int type) +{ + struct trace_event *event; + struct hlist_node *n; + unsigned key; + + key = type & (EVENT_HASHSIZE - 1); + + hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { + if (event->type == type) + return event; + } + + return NULL; +} + +/** + * register_ftrace_event - register output for an event type + * @event: the event type to register + * + * Event types are stored in a hash and this hash is used to + * find a way to print an event. If the @event->type is set + * then it will use that type, otherwise it will assign a + * type to use. + * + * If you assign your own type, please make sure it is added + * to the trace_type enum in trace.h, to avoid collisions + * with the dynamic types. + * + * Returns the event type number or zero on error. + */ +int register_ftrace_event(struct trace_event *event) +{ + unsigned key; + int ret = 0; + + mutex_lock(&trace_event_mutex); + + if (!event->type) + event->type = next_event_type++; + else if (event->type > __TRACE_LAST_TYPE) { + printk(KERN_WARNING "Need to add type to trace.h\n"); + WARN_ON(1); + } + + if (ftrace_find_event(event->type)) + goto out; + + key = event->type & (EVENT_HASHSIZE - 1); + + hlist_add_head_rcu(&event->node, &event_hash[key]); + + ret = event->type; + out: + mutex_unlock(&trace_event_mutex); + + return ret; +} + +/** + * unregister_ftrace_event - remove a no longer used event + * @event: the event to remove + */ +int unregister_ftrace_event(struct trace_event *event) +{ + mutex_lock(&trace_event_mutex); + hlist_del(&event->node); + mutex_unlock(&trace_event_mutex); + + return 0; +} diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h new file mode 100644 index 000000000000..1fcc76e1378e --- /dev/null +++ b/kernel/trace/trace_output.h @@ -0,0 +1,43 @@ +#ifndef __TRACE_EVENTS_H +#define __TRACE_EVENTS_H + +#include "trace.h" + +typedef int (*trace_print_func)(struct trace_seq *s, struct trace_entry *entry, + int flags); + +struct trace_event { + struct hlist_node node; + int type; + trace_print_func trace; + trace_print_func latency_trace; + trace_print_func raw; + trace_print_func hex; + trace_print_func binary; +}; + +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); +int trace_seq_puts(struct trace_seq *s, const char *str); +int trace_seq_putc(struct trace_seq *s, unsigned char c); +int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len); +int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len); +int trace_seq_path(struct trace_seq *s, struct path *path); +int seq_print_userip_objs(const struct userstack_entry *entry, + struct trace_seq *s, unsigned long sym_flags); +int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags); + +struct trace_event *ftrace_find_event(int type); +int register_ftrace_event(struct trace_event *event); +int unregister_ftrace_event(struct trace_event *event); + +#define MAX_MEMHEX_BYTES 8 +#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) + +#endif + diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index a7172a352f62..b9b13c39b4bb 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -16,6 +16,7 @@ #include #include "trace.h" +#include "trace_output.h" static struct trace_array *power_trace; static int __read_mostly trace_power_enabled; -- cgit v1.2.3-59-g8ed1b From f633cef0200bbaec539e2dbb0bc4bed7f022f98b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 23 Dec 2008 23:24:13 -0500 Subject: ftrace: change trace.c to use registered events Impact: rework trace.c to use new event register API Almost every ftrace event has to implement its output display in trace.c through a different function. Some events did not handle all the formats (trace, latency-trace, raw, hex, binary), and this method does not scale well. This patch converts the format functions to use the event API to find the event and and print its format. Currently, we have a print function for trace, latency_trace, raw, hex and binary. A trace_nop_print is available if the event wants to avoid output on a particular format. Perhaps other tracers could use this in the future (like mmiotrace and function_graph). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 402 ++++---------------------------------- kernel/trace/trace_branch.c | 53 +++++ kernel/trace/trace_output.c | 467 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.h | 16 ++ 4 files changed, 574 insertions(+), 364 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 90ce0c1d437b..3f0317586cfd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1483,15 +1483,6 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, trace_seq_puts(s, " : "); } -static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; - -static int task_state_char(unsigned long state) -{ - int bit = state ? __ffs(state) + 1 : 0; - - return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; -} - static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1515,14 +1506,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *next_entry; + struct trace_event *event; unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); struct trace_entry *entry = iter->ent; unsigned long abs_usecs; unsigned long rel_usecs; u64 next_ts; char *comm; - int S, T; - int i; + int ret; test_cpu_buff_start(iter); @@ -1547,94 +1538,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) lat_print_generic(s, entry, cpu); lat_print_timestamp(s, abs_usecs, rel_usecs); } - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - seq_print_ip_sym(s, field->ip, sym_flags); - trace_seq_puts(s, " ("); - seq_print_ip_sym(s, field->parent_ip, sym_flags); - trace_seq_puts(s, ")\n"); - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); - comm = trace_find_cmdline(field->next_pid); - trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", - field->prev_pid, - field->prev_prio, - S, entry->type == TRACE_CTX ? "==>" : " +", - field->next_cpu, - field->next_pid, - field->next_prio, - T, comm); - break; - } - case TRACE_SPECIAL: { - struct special_entry *field; - - trace_assign_type(field, entry); - - trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3); - break; - } - case TRACE_STACK: { - struct stack_entry *field; - - trace_assign_type(field, entry); - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (i) - trace_seq_puts(s, " <= "); - seq_print_ip_sym(s, field->caller[i], sym_flags); - } - trace_seq_puts(s, "\n"); - break; - } - case TRACE_PRINT: { - struct print_entry *field; - - trace_assign_type(field, entry); - seq_print_ip_sym(s, field->ip, sym_flags); - trace_seq_printf(s, ": %s", field->buf); - break; - } - case TRACE_BRANCH: { - struct trace_branch *field; - - trace_assign_type(field, entry); - - trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? " ok " : " MISS ", - field->func, - field->file, - field->line); - break; + event = ftrace_find_event(entry->type); + if (event && event->latency_trace) { + ret = event->latency_trace(s, entry, sym_flags); + if (ret) + return ret; + return TRACE_TYPE_HANDLED; } - case TRACE_USER_STACK: { - struct userstack_entry *field; - trace_assign_type(field, entry); - - seq_print_userip_objs(field, s, sym_flags); - trace_seq_putc(s, '\n'); - break; - } - default: - trace_seq_printf(s, "Unknown type %d\n", entry->type); - } + trace_seq_printf(s, "Unknown type %d\n", entry->type); return TRACE_TYPE_HANDLED; } @@ -1643,13 +1556,12 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; + struct trace_event *event; unsigned long usec_rem; unsigned long long t; unsigned long secs; char *comm; int ret; - int S, T; - int i; entry = iter->ent; @@ -1671,127 +1583,17 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - ret = seq_print_ip_sym(s, field->ip, sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - if ((sym_flags & TRACE_ITER_PRINT_PARENT) && - field->parent_ip) { - ret = trace_seq_printf(s, " <-"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = seq_print_ip_sym(s, - field->parent_ip, - sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_printf(s, "\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = task_state_char(field->prev_state); - ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", - field->prev_pid, - field->prev_prio, - S, - entry->type == TRACE_CTX ? "==>" : " +", - field->next_cpu, - field->next_pid, - field->next_prio, - T); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_SPECIAL: { - struct special_entry *field; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_STACK: { - struct stack_entry *field; - - trace_assign_type(field, entry); - - for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (i) { - ret = trace_seq_puts(s, " <= "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = seq_print_ip_sym(s, field->caller[i], - sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - ret = trace_seq_puts(s, "\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_PRINT: { - struct print_entry *field; - - trace_assign_type(field, entry); - - seq_print_ip_sym(s, field->ip, sym_flags); - trace_seq_printf(s, ": %s", field->buf); - break; - } - case TRACE_GRAPH_RET: { - return print_graph_function(iter); - } - case TRACE_GRAPH_ENT: { - return print_graph_function(iter); - } - case TRACE_BRANCH: { - struct trace_branch *field; - - trace_assign_type(field, entry); - - trace_seq_printf(s, "[%s] %s:%s:%d\n", - field->correct ? " ok " : " MISS ", - field->func, - field->file, - field->line); - break; + event = ftrace_find_event(entry->type); + if (event && event->trace) { + ret = event->trace(s, entry, sym_flags); + if (ret) + return ret; + return TRACE_TYPE_HANDLED; } - case TRACE_USER_STACK: { - struct userstack_entry *field; - - trace_assign_type(field, entry); + ret = trace_seq_printf(s, "Unknown type %d\n", entry->type); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - ret = seq_print_userip_objs(field, s, sym_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_putc(s, '\n'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - } return TRACE_TYPE_HANDLED; } @@ -1799,8 +1601,8 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; + struct trace_event *event; int ret; - int S, T; entry = iter->ent; @@ -1809,86 +1611,26 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) if (!ret) return TRACE_TYPE_PARTIAL_LINE; - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%x %x\n", - field->ip, - field->parent_ip); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = entry->type == TRACE_WAKE ? '+' : - task_state_char(field->prev_state); - ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", - field->prev_pid, - field->prev_prio, - S, - field->next_cpu, - field->next_pid, - field->next_prio, - T); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; - } - case TRACE_SPECIAL: - case TRACE_USER_STACK: - case TRACE_STACK: { - struct special_entry *field; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - break; + event = ftrace_find_event(entry->type); + if (event && event->raw) { + ret = event->raw(s, entry, 0); + if (ret) + return ret; + return TRACE_TYPE_HANDLED; } - case TRACE_PRINT: { - struct print_entry *field; - - trace_assign_type(field, entry); + ret = trace_seq_printf(s, "%d ?\n", entry->type); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - trace_seq_printf(s, "# %lx %s", field->ip, field->buf); - break; - } - } return TRACE_TYPE_HANDLED; } -#define SEQ_PUT_FIELD_RET(s, x) \ -do { \ - if (!trace_seq_putmem(s, &(x), sizeof(x))) \ - return 0; \ -} while (0) - -#define SEQ_PUT_HEX_FIELD_RET(s, x) \ -do { \ - BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ - if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ - return 0; \ -} while (0) - static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; struct trace_entry *entry; - int S, T; + struct trace_event *event; entry = iter->ent; @@ -1896,47 +1638,10 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); SEQ_PUT_HEX_FIELD_RET(s, iter->ts); - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); - - SEQ_PUT_HEX_FIELD_RET(s, field->ip); - SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); - break; - } - case TRACE_CTX: - case TRACE_WAKE: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - T = task_state_char(field->next_state); - S = entry->type == TRACE_WAKE ? '+' : - task_state_char(field->prev_state); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); - SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); - SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); - SEQ_PUT_HEX_FIELD_RET(s, T); - break; - } - case TRACE_SPECIAL: - case TRACE_USER_STACK: - case TRACE_STACK: { - struct special_entry *field; - - trace_assign_type(field, entry); + event = ftrace_find_event(entry->type); + if (event && event->hex) + event->hex(s, entry, 0); - SEQ_PUT_HEX_FIELD_RET(s, field->arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - break; - } - } SEQ_PUT_FIELD_RET(s, newline); return TRACE_TYPE_HANDLED; @@ -1962,6 +1667,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; + struct trace_event *event; entry = iter->ent; @@ -1969,43 +1675,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); - switch (entry->type) { - case TRACE_FN: { - struct ftrace_entry *field; - - trace_assign_type(field, entry); + event = ftrace_find_event(entry->type); + if (event && event->binary) + event->binary(s, entry, 0); - SEQ_PUT_FIELD_RET(s, field->ip); - SEQ_PUT_FIELD_RET(s, field->parent_ip); - break; - } - case TRACE_CTX: { - struct ctx_switch_entry *field; - - trace_assign_type(field, entry); - - SEQ_PUT_FIELD_RET(s, field->prev_pid); - SEQ_PUT_FIELD_RET(s, field->prev_prio); - SEQ_PUT_FIELD_RET(s, field->prev_state); - SEQ_PUT_FIELD_RET(s, field->next_pid); - SEQ_PUT_FIELD_RET(s, field->next_prio); - SEQ_PUT_FIELD_RET(s, field->next_state); - break; - } - case TRACE_SPECIAL: - case TRACE_USER_STACK: - case TRACE_STACK: { - struct special_entry *field; - - trace_assign_type(field, entry); - - SEQ_PUT_FIELD_RET(s, field->arg1); - SEQ_PUT_FIELD_RET(s, field->arg2); - SEQ_PUT_FIELD_RET(s, field->arg3); - break; - } - } - return 1; + return TRACE_TYPE_HANDLED; } static int trace_empty(struct trace_iterator *iter) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6c00feb3bac7..c15222a01073 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -14,7 +14,9 @@ #include #include #include + #include "trace.h" +#include "trace_output.h" #ifdef CONFIG_BRANCH_TRACER @@ -142,6 +144,49 @@ static void branch_trace_reset(struct trace_array *tr) stop_branch_trace(tr); } +static int +trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, entry); + + if (seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int +trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct trace_branch *field; + + trace_assign_type(field, entry); + + if (trace_seq_printf(s, "[%s] %s:%s:%d\n", + field->correct ? " ok " : " MISS ", + field->func, + field->file, + field->line)) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} + +static struct trace_event trace_branch_event = { + .type = TRACE_BRANCH, + .trace = trace_branch_print, + .latency_trace = trace_branch_print, + .raw = trace_nop_print, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + struct tracer branch_trace __read_mostly = { .name = "branch", @@ -154,6 +199,14 @@ struct tracer branch_trace __read_mostly = __init static int init_branch_trace(void) { + int ret; + + ret = register_ftrace_event(&trace_branch_event); + if (!ret) { + printk(KERN_WARNING "Warning: could not register branch events\n"); + return 1; + } + return register_tracer(&branch_trace); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1f3f80002b5e..df0c25cbed30 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -286,6 +286,15 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -363,3 +372,461 @@ int unregister_ftrace_event(struct trace_event *event) return 0; } + +/* + * Standard events + */ + +int +trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return 0; +} + +/* TRACE_FN */ +static int +trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + if (!trace_seq_puts(s, " (")) + goto partial; + if (!seq_print_ip_sym(s, field->parent_ip, flags)) + goto partial; + if (!trace_seq_puts(s, ")\n")) + goto partial; + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int +trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { + if (!trace_seq_printf(s, " <-")) + goto partial; + if (!seq_print_ip_sym(s, + field->parent_ip, + flags)) + goto partial; + } + if (!trace_seq_printf(s, "\n")) + goto partial; + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int +trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + if (trace_seq_printf(s, "%x %x\n", + field->ip, + field->parent_ip)) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} + +static int +trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_HEX_FIELD_RET(s, field->ip); + SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); + + return 0; +} + +static int +trace_fn_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->ip); + SEQ_PUT_FIELD_RET(s, field->parent_ip); + + return 0; +} + +static struct trace_event trace_fn_event = { + .type = TRACE_FN, + .trace = trace_fn_trace, + .latency_trace = trace_fn_latency, + .raw = trace_fn_raw, + .hex = trace_fn_hex, + .binary = trace_fn_bin, +}; + +/* TRACE_CTX an TRACE_WAKE */ +static int +trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags, + char *delim) +{ + struct ctx_switch_entry *field; + char *comm; + int S, T; + + trace_assign_type(field, entry); + + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); + comm = trace_find_cmdline(field->next_pid); + if (trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm)) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} + +static int +trace_ctx_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_print(s, entry, flags, "==>"); +} + +static int +trace_wake_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_print(s, entry, flags, " +"); +} + +static int +trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags, + char S) +{ + struct ctx_switch_entry *field; + int T; + + trace_assign_type(field, entry); + + if (!S) + task_state_char(field->prev_state); + T = task_state_char(field->next_state); + if (trace_seq_printf(s, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T)) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} + +static int +trace_ctx_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_raw(s, entry, flags, 0); +} + +static int +trace_wake_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_raw(s, entry, flags, '+'); +} + + +static int +trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags, + char S) +{ + struct ctx_switch_entry *field; + int T; + + trace_assign_type(field, entry); + + if (!S) + task_state_char(field->prev_state); + T = task_state_char(field->next_state); + + SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, S); + SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); + SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); + SEQ_PUT_HEX_FIELD_RET(s, T); + + return 0; +} + +static int +trace_ctx_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_hex(s, entry, flags, 0); +} + +static int +trace_wake_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + return trace_ctxwake_hex(s, entry, flags, '+'); +} + +static int +trace_ctxwake_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->prev_pid); + SEQ_PUT_FIELD_RET(s, field->prev_prio); + SEQ_PUT_FIELD_RET(s, field->prev_state); + SEQ_PUT_FIELD_RET(s, field->next_pid); + SEQ_PUT_FIELD_RET(s, field->next_prio); + SEQ_PUT_FIELD_RET(s, field->next_state); + + return 0; +} + +static struct trace_event trace_ctx_event = { + .type = TRACE_CTX, + .trace = trace_ctx_print, + .latency_trace = trace_ctx_print, + .raw = trace_ctx_raw, + .hex = trace_ctx_hex, + .binary = trace_ctxwake_bin, +}; + +static struct trace_event trace_wake_event = { + .type = TRACE_WAKE, + .trace = trace_wake_print, + .latency_trace = trace_wake_print, + .raw = trace_wake_raw, + .hex = trace_wake_hex, + .binary = trace_ctxwake_bin, +}; + +/* TRACE_SPECIAL */ +static int +trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct special_entry *field; + + trace_assign_type(field, entry); + + if (trace_seq_printf(s, "# %ld %ld %ld\n", + field->arg1, + field->arg2, + field->arg3)) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} + +static int +trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct special_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_HEX_FIELD_RET(s, field->arg1); + SEQ_PUT_HEX_FIELD_RET(s, field->arg2); + SEQ_PUT_HEX_FIELD_RET(s, field->arg3); + + return 0; +} + +static int +trace_special_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct special_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->arg1); + SEQ_PUT_FIELD_RET(s, field->arg2); + SEQ_PUT_FIELD_RET(s, field->arg3); + + return 0; +} + +static struct trace_event trace_special_event = { + .type = TRACE_SPECIAL, + .trace = trace_special_print, + .latency_trace = trace_special_print, + .raw = trace_special_print, + .hex = trace_special_hex, + .binary = trace_special_bin, +}; + +/* TRACE_STACK */ + +static int +trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct stack_entry *field; + int i; + + trace_assign_type(field, entry); + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) { + if (trace_seq_puts(s, " <= ")) + goto partial; + + if (seq_print_ip_sym(s, field->caller[i], flags)) + goto partial; + } + if (trace_seq_puts(s, "\n")) + goto partial; + } + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_stack_event = { + .type = TRACE_STACK, + .trace = trace_stack_print, + .latency_trace = trace_stack_print, + .raw = trace_special_print, + .hex = trace_special_hex, + .binary = trace_special_bin, +}; + +/* TRACE_USER_STACK */ +static int +trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry, + int flags) +{ + struct userstack_entry *field; + + trace_assign_type(field, entry); + + if (seq_print_userip_objs(field, s, flags)) + goto partial; + + if (trace_seq_putc(s, '\n')) + goto partial; + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_user_stack_event = { + .type = TRACE_USER_STACK, + .trace = trace_user_stack_print, + .latency_trace = trace_user_stack_print, + .raw = trace_special_print, + .hex = trace_special_hex, + .binary = trace_special_bin, +}; + +/* TRACE_PRINT */ +static int +trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, entry); + + if (seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static int +trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, entry); + + if (seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (trace_seq_printf(s, "# %lx %s", field->ip, field->buf)) + goto partial; + + return 0; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .trace = trace_print_print, + .latency_trace = trace_print_print, + .raw = trace_print_raw, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + +static struct trace_event *events[] __initdata = { + &trace_fn_event, + &trace_ctx_event, + &trace_wake_event, + &trace_special_event, + &trace_stack_event, + &trace_user_stack_event, + &trace_print_event, + NULL +}; + +__init static int init_events(void) +{ + struct trace_event *event; + int i, ret; + + for (i = 0; events[i]; i++) { + event = events[i]; + + ret = register_ftrace_event(event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + event->type); + WARN_ON_ONCE(1); + } + } + + return 0; +} +device_initcall(init_events); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 1fcc76e1378e..ecab4ea4a4fd 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -36,8 +36,24 @@ struct trace_event *ftrace_find_event(int type); int register_ftrace_event(struct trace_event *event); int unregister_ftrace_event(struct trace_event *event); +int +trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags); + #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) +#define SEQ_PUT_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + +#define SEQ_PUT_HEX_FIELD_RET(s, x) \ +do { \ + BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ + if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ + return 0; \ +} while (0) + #endif -- cgit v1.2.3-59-g8ed1b From dbd0b4b33074aa6b7832a9d9a5bd985eca5c1aa2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 28 Dec 2008 20:44:51 -0800 Subject: tracing/ftrace: provide the base infrastructure for histogram tracing Impact: extend the tracing API The goal of this patch is to normalize and make more easy the implementation of statistical (histogram) tracing. It implements a trace_stat file into the /debugfs/tracing directory where one can print a one-shot output of statistics/histogram entries. A tracer has to provide two basic iterator callbacks: stat_start() => the first entry stat_next(prev, idx) => the next one. Note that it is adapted for arrays or hash tables or lists.... since it provides a pointer to the previous entry and the current index of the iterator. These two callbacks are called to get a snapshot of the statistics at each opening of the trace_stat file because. The values are so updated between two "cat trace_stat". And the tracer is free to lock its datas during the iteration to keep consistent values. Since it is almost always interesting to sort statisticals values to address the problems by priority, this infrastructure provides a "sorting" of the stat entries too if desired. A tracer has just to provide a stat_cmp callback to compare two entries and the stat tracing infrastructure will build a sorted list of the given entries. A last callback, called stat_headers, can be implemented by a tracer to output headers on its trace. If one of these callbacks is changed on runtime, it just have to signal it to the stat tracing API by calling the init_tracer_stat() helper. Changes in V2: - Fix a memory leak if the user opens multiple times the trace_stat file without closing it. Now we always free our list before rebuilding it. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/Makefile | 1 + kernel/trace/trace.c | 3 +- kernel/trace/trace.h | 17 ++++ kernel/trace/trace_stat.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 271 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/trace_stat.c diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 549f93c9b393..31cd5fbc0eed 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_output.o +obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3f0317586cfd..b789c010512c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2354,6 +2354,7 @@ static int tracing_set_tracer(char *buf) if (ret) goto out; } + init_tracer_stat(t); trace_branch_enable(tr); out: @@ -3206,7 +3207,7 @@ __init static int tracer_alloc_buffers(void) #else current_trace = &nop_trace; #endif - + init_tracer_stat(current_trace); /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6bd71fa1e1c7..05fa804d1c16 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -336,6 +336,21 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; + + /* + * If you change one of the following on tracing runtime, recall + * init_tracer_stat() + */ + + /* Iteration over statistic entries */ + void *(*stat_start)(void); + void *(*stat_next)(void *prev, int idx); + /* Compare two entries for sorting (optional) for stats */ + int (*stat_cmp)(void *p1, void *p2); + /* Print a stat entry */ + int (*stat_show)(struct seq_file *s, void *p); + /* Print the headers of your stat entries */ + int (*stat_headers)(struct seq_file *s); }; struct trace_seq { @@ -421,6 +436,8 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); +void init_tracer_stat(struct tracer *trace); + extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_max_latency; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c new file mode 100644 index 000000000000..6f194a33a64a --- /dev/null +++ b/kernel/trace/trace_stat.c @@ -0,0 +1,251 @@ +/* + * Infrastructure for statistic tracing (histogram output). + * + * Copyright (C) 2008 Frederic Weisbecker + * + * Based on the code from trace_branch.c which is + * Copyright (C) 2008 Steven Rostedt + * + */ + + +#include +#include +#include +#include "trace.h" + + +/* List of stat entries from a tracer */ +struct trace_stat_list { + struct list_head list; + void *stat; +}; + +static struct trace_stat_list stat_list; + +/* + * This is a copy of the current tracer to avoid racy + * and dangerous output while the current tracer is + * switched. + */ +static struct tracer current_tracer; + +/* + * Protect both the current tracer and the global + * stat list. + */ +static DEFINE_MUTEX(stat_list_mutex); + + +static void reset_stat_list(void) +{ + struct trace_stat_list *node; + struct list_head *next; + + if (list_empty(&stat_list.list)) + return; + + node = list_entry(stat_list.list.next, struct trace_stat_list, list); + next = node->list.next; + + while (&node->list != next) { + kfree(node); + node = list_entry(next, struct trace_stat_list, list); + } + kfree(node); + + INIT_LIST_HEAD(&stat_list.list); +} + +void init_tracer_stat(struct tracer *trace) +{ + mutex_lock(&stat_list_mutex); + current_tracer = *trace; + mutex_unlock(&stat_list_mutex); +} + +/* + * For tracers that don't provide a stat_cmp callback. + * This one will force an immediate insertion on tail of + * the list. + */ +static int dummy_cmp(void *p1, void *p2) +{ + return 1; +} + +/* + * Initialize the stat list at each trace_stat file opening. + * All of these copies and sorting are required on all opening + * since the stats could have changed between two file sessions. + */ +static int stat_seq_init(void) +{ + struct trace_stat_list *iter_entry, *new_entry; + void *prev_stat; + int ret = 0; + int i; + + mutex_lock(&stat_list_mutex); + reset_stat_list(); + + if (!current_tracer.stat_start || !current_tracer.stat_next || + !current_tracer.stat_show) + goto exit; + + if (!current_tracer.stat_cmp) + current_tracer.stat_cmp = dummy_cmp; + + /* + * The first entry. Actually this is the second, but the first + * one (the stat_list head) is pointless. + */ + new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + if (!new_entry) { + ret = -ENOMEM; + goto exit; + } + + INIT_LIST_HEAD(&new_entry->list); + list_add(&new_entry->list, &stat_list.list); + new_entry->stat = current_tracer.stat_start(); + + prev_stat = new_entry->stat; + + /* + * Iterate over the tracer stat entries and store them in a sorted + * list. + */ + for (i = 1; ; i++) { + new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + if (!new_entry) { + ret = -ENOMEM; + goto exit_free_list; + } + + INIT_LIST_HEAD(&new_entry->list); + new_entry->stat = current_tracer.stat_next(prev_stat, i); + + /* End of insertion */ + if (!new_entry->stat) + break; + + list_for_each_entry(iter_entry, &stat_list.list, list) { + /* Insertion with a descendent sorting */ + if (current_tracer.stat_cmp(new_entry->stat, + iter_entry->stat) > 0) { + + list_add_tail(&new_entry->list, + &iter_entry->list); + break; + + /* The current smaller value */ + } else if (list_is_last(&iter_entry->list, + &stat_list.list)) { + list_add(&new_entry->list, &iter_entry->list); + break; + } + } + + prev_stat = new_entry->stat; + } +exit: + mutex_unlock(&stat_list_mutex); + return ret; + +exit_free_list: + reset_stat_list(); + mutex_unlock(&stat_list_mutex); + return ret; +} + + +static void *stat_seq_start(struct seq_file *s, loff_t *pos) +{ + struct trace_stat_list *l = (struct trace_stat_list *)s->private; + + /* Prevent from tracer switch or stat_list modification */ + mutex_lock(&stat_list_mutex); + + /* If we are in the beginning of the file, print the headers */ + if (!*pos && current_tracer.stat_headers) + current_tracer.stat_headers(s); + + return seq_list_start(&l->list, *pos); +} + +static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) +{ + struct trace_stat_list *l = (struct trace_stat_list *)s->private; + + return seq_list_next(p, &l->list, pos); +} + +static void stat_seq_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&stat_list_mutex); +} + +static int stat_seq_show(struct seq_file *s, void *v) +{ + struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + return current_tracer.stat_show(s, l->stat); +} + +static const struct seq_operations trace_stat_seq_ops = { + .start = stat_seq_start, + .next = stat_seq_next, + .stop = stat_seq_stop, + .show = stat_seq_show +}; + +static int tracing_stat_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &trace_stat_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = &stat_list; + ret = stat_seq_init(); + } + + return ret; +} + + +/* + * Avoid consuming memory with our now useless list. + */ +static int tracing_stat_release(struct inode *i, struct file *f) +{ + mutex_lock(&stat_list_mutex); + reset_stat_list(); + mutex_unlock(&stat_list_mutex); + return 0; +} + +static const struct file_operations tracing_stat_fops = { + .open = tracing_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_stat_release +}; + +static int __init tracing_stat_init(void) +{ + struct dentry *d_tracing; + struct dentry *entry; + + INIT_LIST_HEAD(&stat_list.list); + d_tracing = tracing_init_dentry(); + + entry = debugfs_create_file("trace_stat", 0444, d_tracing, + NULL, + &tracing_stat_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'trace_stat' entry\n"); + return 0; +} +fs_initcall(tracing_stat_init); -- cgit v1.2.3-59-g8ed1b From e302cf3f961ceb54c1dd0aff7ba8531df83be07a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 27 Dec 2008 23:25:38 +0100 Subject: tracing/branch-tracer: adapt to the stat tracing API Impact: refactor the branch tracer This patch adapts the branch tracer to the tracing API. This is a proof of concept because the branch tracer implements two "stat tracing" that were split in two files. So I added an option to the branch tracer: stat_all_branch. If it is set, then trace_stat will output all of the branches entries stats. Otherwise, it will print the annotated branches. Its is a kind of quick trick, waiting for a better solution. By default, the annotated branches stat are sorted by incorrect branch prediction percentage. Ie: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 1 100 native_smp_prepare_cpus smpboot.c 1228 0 1 100 hpet_rtc_timer_reinit hpet.c 1057 0 18032 100 sched_info_queued sched_stats.h 223 0 684 100 yield_task_fair sched_fair.c 984 0 282 100 pre_schedule_rt sched_rt.c 1263 0 13414 100 sched_info_dequeued sched_stats.h 178 0 21724 100 sched_info_switch sched_stats.h 270 0 1 100 get_signal_to_deliver signal.c 1820 0 8 100 __cancel_work_timer workqueue.c 560 0 212 100 verify_export_symbols module.c 1509 0 17 100 __rmqueue_fallback page_alloc.c 793 0 43 100 clear_page_mlock internal.h 129 0 124 100 try_to_unmap_anon rmap.c 1021 0 53 100 try_to_unmap_anon rmap.c 1013 0 6 100 vma_address rmap.c 232 0 3301 100 try_to_unmap_file rmap.c 1082 0 466 100 try_to_unmap_file rmap.c 1077 0 1 100 mem_cgroup_create memcontrol.c 1090 0 3 100 inotify_find_update_watch inotify.c 726 2 30163 99 perf_counter_task_sched_out perf_counter.c 385 1 2935 99 percpu_free allocpercpu.c 138 1544 297672 99 dentry_lru_del_init dcache.c 153 8 1074 99 input_pass_event input.c 86 1390 76781 98 mapping_unevictable pagemap.h 50 280 6665 95 pick_next_task_rt sched_rt.c 889 750 4826 86 next_pidmap pid.c 194 2 8 80 blocking_notifier_chain_regist notifier.c 220 36 130 78 ioremap_pte_range ioremap.c 22 1093 3247 74 IS_ERR err.h 34 1023 2908 73 sched_slice sched_fair.c 445 22 60 73 disk_put_part genhd.h 206 [...] It enables a developer to quickly address the source of incorrect branch predictions. Note that this sorting would be better with a second sort on the number of incorrect predictions. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 268 +++++++++++++++++++++++--------------------- 1 file changed, 142 insertions(+), 126 deletions(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index c15222a01073..4785a3b9bc4a 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -18,10 +18,13 @@ #include "trace.h" #include "trace_output.h" +static struct tracer branch_trace; + #ifdef CONFIG_BRANCH_TRACER static int branch_tracing_enabled __read_mostly; static DEFINE_MUTEX(branch_tracing_mutex); + static struct trace_array *branch_tracer; static void @@ -178,6 +181,7 @@ trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags) return 0; } + static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, .trace = trace_branch_print, @@ -187,30 +191,6 @@ static struct trace_event trace_branch_event = { .binary = trace_nop_print, }; -struct tracer branch_trace __read_mostly = -{ - .name = "branch", - .init = branch_trace_init, - .reset = branch_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_branch, -#endif -}; - -__init static int init_branch_trace(void) -{ - int ret; - - ret = register_ftrace_event(&trace_branch_event); - if (!ret) { - printk(KERN_WARNING "Warning: could not register branch events\n"); - return 1; - } - - return register_tracer(&branch_trace); -} - -device_initcall(init_branch_trace); #else static inline void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) @@ -236,66 +216,39 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) } EXPORT_SYMBOL(ftrace_likely_update); -struct ftrace_pointer { - void *start; - void *stop; - int hit; -}; +extern unsigned long __start_annotated_branch_profile[]; +extern unsigned long __stop_annotated_branch_profile[]; -static void * -t_next(struct seq_file *m, void *v, loff_t *pos) +static int annotated_branch_stat_headers(struct seq_file *m) { - const struct ftrace_pointer *f = m->private; - struct ftrace_branch_data *p = v; - - (*pos)++; - - if (v == (void *)1) - return f->start; - - ++p; - - if ((void *)p >= (void *)f->stop) - return NULL; - - return p; + seq_printf(m, " correct incorrect %% "); + seq_printf(m, " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; } -static void *t_start(struct seq_file *m, loff_t *pos) +static inline long get_incorrect_percent(struct ftrace_branch_data *p) { - void *t = (void *)1; - loff_t l = 0; - - for (; t && l < *pos; t = t_next(m, t, &l)) - ; + long percent; - return t; -} + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 100 : -1; -static void t_stop(struct seq_file *m, void *p) -{ + return percent; } -static int t_show(struct seq_file *m, void *v) +static int branch_stat_show(struct seq_file *m, void *v) { - const struct ftrace_pointer *fp = m->private; struct ftrace_branch_data *p = v; const char *f; long percent; - if (v == (void *)1) { - if (fp->hit) - seq_printf(m, " miss hit %% "); - else - seq_printf(m, " correct incorrect %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); - return 0; - } - /* Only print the file, not the path */ f = p->file + strlen(p->file); while (f >= p->file && *f != '/') @@ -305,11 +258,7 @@ static int t_show(struct seq_file *m, void *v) /* * The miss is overlayed on correct, and hit on incorrect. */ - if (p->correct) { - percent = p->incorrect * 100; - percent /= p->correct + p->incorrect; - } else - percent = p->incorrect ? 100 : -1; + percent = get_incorrect_percent(p); seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) @@ -320,76 +269,143 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations tracing_likely_seq_ops = { - .start = t_start, - .next = t_next, - .stop = t_stop, - .show = t_show, -}; +static void *annotated_branch_stat_start(void) +{ + return __start_annotated_branch_profile; +} -static int tracing_branch_open(struct inode *inode, struct file *file) +static void * +annotated_branch_stat_next(void *v, int idx) { - int ret; + struct ftrace_branch_data *p = v; - ret = seq_open(file, &tracing_likely_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = (void *)inode->i_private; - } + ++p; - return ret; + if ((void *)p >= (void *)__stop_annotated_branch_profile) + return NULL; + + return p; } -static const struct file_operations tracing_branch_fops = { - .open = tracing_branch_open, - .read = seq_read, - .llseek = seq_lseek, -}; +static int annotated_branch_stat_cmp(void *p1, void *p2) +{ + struct ftrace_branch_data *a = p1; + struct ftrace_branch_data *b = p2; + + long percent_a, percent_b; + + percent_a = get_incorrect_percent(a); + percent_b = get_incorrect_percent(b); + + if (percent_a < percent_b) + return -1; + if (percent_a > percent_b) + return 1; + else + return 0; +} #ifdef CONFIG_PROFILE_ALL_BRANCHES -extern unsigned long __start_branch_profile[]; -extern unsigned long __stop_branch_profile[]; +enum { + TRACE_BRANCH_OPT_ALL = 0x1 +}; -static const struct ftrace_pointer ftrace_branch_pos = { - .start = __start_branch_profile, - .stop = __stop_branch_profile, - .hit = 1, +static struct tracer_opt branch_opts[] = { + { TRACER_OPT(stat_all_branch, TRACE_BRANCH_OPT_ALL) }, + { } }; -#endif /* CONFIG_PROFILE_ALL_BRANCHES */ +static struct tracer_flags branch_flags = { + .val = 0, + .opts = branch_opts +}; -extern unsigned long __start_annotated_branch_profile[]; -extern unsigned long __stop_annotated_branch_profile[]; +extern unsigned long __start_branch_profile[]; +extern unsigned long __stop_branch_profile[]; -static const struct ftrace_pointer ftrace_annotated_branch_pos = { - .start = __start_annotated_branch_profile, - .stop = __stop_annotated_branch_profile, -}; +static int all_branch_stat_headers(struct seq_file *m) +{ + seq_printf(m, " miss hit %% "); + seq_printf(m, " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; +} -static __init int ftrace_branch_init(void) +static void *all_branch_stat_start(void) { - struct dentry *d_tracer; - struct dentry *entry; + return __start_branch_profile; +} + +static void * +all_branch_stat_next(void *v, int idx) +{ + struct ftrace_branch_data *p = v; - d_tracer = tracing_init_dentry(); + ++p; - entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, - (void *)&ftrace_annotated_branch_pos, - &tracing_branch_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'profile_annotatet_branch' entry\n"); + if ((void *)p >= (void *)__stop_branch_profile) + return NULL; -#ifdef CONFIG_PROFILE_ALL_BRANCHES - entry = debugfs_create_file("profile_branch", 0444, d_tracer, - (void *)&ftrace_branch_pos, - &tracing_branch_fops); - if (!entry) - pr_warning("Could not create debugfs" - " 'profile_branch' entry\n"); -#endif + return p; +} +static int branch_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_BRANCH_OPT_ALL) { + if (set) { + branch_trace.stat_headers = all_branch_stat_headers; + branch_trace.stat_start = all_branch_stat_start; + branch_trace.stat_next = all_branch_stat_next; + branch_trace.stat_cmp = NULL; + } else { + branch_trace.stat_headers = + annotated_branch_stat_headers; + branch_trace.stat_start = annotated_branch_stat_start; + branch_trace.stat_next = annotated_branch_stat_next; + branch_trace.stat_cmp = annotated_branch_stat_cmp; + } + init_tracer_stat(&branch_trace); + } return 0; } -device_initcall(ftrace_branch_init); +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ + +static struct tracer branch_trace __read_mostly = +{ + .name = "branch", +#ifdef CONFIG_BRANCH_TRACER + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif /* CONFIG_FTRACE_SELFTEST */ +#endif /* CONFIG_BRANCH_TRACER */ + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_show = branch_stat_show, + .stat_headers = annotated_branch_stat_headers, + .stat_cmp = annotated_branch_stat_cmp, +#ifdef CONFIG_PROFILE_ALL_BRANCHES + .flags = &branch_flags, + .set_flag = branch_set_flag, +#endif +}; + +__init static int init_branch_trace(void) +{ +#ifdef CONFIG_BRANCH_TRACER + int ret; + ret = register_ftrace_event(&trace_branch_event); + if (!ret) { + printk(KERN_WARNING "Warning: could not register branch events\n"); + return 1; + } +#endif + + return register_tracer(&branch_trace); +} +device_initcall(init_branch_trace); -- cgit v1.2.3-59-g8ed1b From f7d48cbde5c0710008caeaf7dbf14f4a9b064940 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Dec 2008 13:02:17 +0100 Subject: tracing/ftrace: make trace_find_cmdline() generally available Impact: build fix On !CONFIG_CONTEXT_SWITCH_TRACER trace_find_cmdline() is not defined: kernel/trace/trace_output.c: In function 'trace_ctxwake_print': kernel/trace/trace_output.c:499: error: implicit declaration of function 'trace_find_cmdline' kernel/trace/trace_output.c:499: warning: assignment makes pointer from integer without a cast Move it to the generic section in trace.h. Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 05fa804d1c16..a8b624ccd4d6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -469,10 +469,10 @@ struct tracer_switch_ops { void *private; struct tracer_switch_ops *next; }; - -char *trace_find_cmdline(int pid); #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ +extern char *trace_find_cmdline(int pid); + #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func -- cgit v1.2.3-59-g8ed1b From 35995a4d815586bc968a857f7235707940a2f755 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:25 +0300 Subject: SLUB: Replace __builtin_return_address(0) with _RET_IP_. This patch replaces __builtin_return_address(0) with _RET_IP_, since a previous patch moved _RET_IP_ and _THIS_IP_ to include/linux/kernel.h and they're widely available now. This makes for shorter and easier to read code. [penberg@cs.helsinki.fi: remove _RET_IP_ casts to void pointer] Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slab.h | 8 ++++---- mm/slab.c | 8 ++++---- mm/slub.c | 48 ++++++++++++++++++++++++------------------------ 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 000da12b5cf0..c97ed28559ec 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -253,9 +253,9 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep, * request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_track_caller(size_t, gfp_t, void*); +extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, __builtin_return_address(0)) + __kmalloc_track_caller(size, flags, _RET_IP_) #else #define kmalloc_track_caller(size, flags) \ __kmalloc(size, flags) @@ -271,10 +271,10 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, void*); * allocation request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *); +extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ - __builtin_return_address(0)) + _RET_IP_) #else #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node(size, flags, node) diff --git a/mm/slab.c b/mm/slab.c index 09187517f9dc..a14787799014 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3686,9 +3686,9 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(__kmalloc_node); void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, void *caller) + int node, unsigned long caller) { - return __do_kmalloc_node(size, flags, node, caller); + return __do_kmalloc_node(size, flags, node, (void *)caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); #else @@ -3730,9 +3730,9 @@ void *__kmalloc(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) { - return __do_kmalloc(size, flags, caller); + return __do_kmalloc(size, flags, (void *)caller); } EXPORT_SYMBOL(__kmalloc_track_caller); diff --git a/mm/slub.c b/mm/slub.c index 7ad489af9561..06da86654875 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -178,7 +178,7 @@ static LIST_HEAD(slab_caches); * Tracking user of a slab. */ struct track { - void *addr; /* Called from address */ + unsigned long addr; /* Called from address */ int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -367,7 +367,7 @@ static struct track *get_track(struct kmem_cache *s, void *object, } static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, void *addr) + enum track_item alloc, unsigned long addr) { struct track *p; @@ -391,8 +391,8 @@ static void init_tracking(struct kmem_cache *s, void *object) if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); + set_track(s, object, TRACK_FREE, 0UL); + set_track(s, object, TRACK_ALLOC, 0UL); } static void print_track(const char *s, struct track *t) @@ -401,7 +401,7 @@ static void print_track(const char *s, struct track *t) return; printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, t->addr, jiffies - t->when, t->cpu, t->pid); + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); } static void print_tracking(struct kmem_cache *s, void *object) @@ -866,7 +866,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, } static int alloc_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto bad; @@ -906,7 +906,7 @@ bad: } static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) + void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@ -1029,10 +1029,10 @@ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } + struct page *page, void *object, unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -1499,8 +1499,8 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. */ -static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) { void **object; struct page *new; @@ -1584,7 +1584,7 @@ debug: * Otherwise we can simply pick the next object from the lockless free list. */ static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) + gfp_t gfpflags, int node, unsigned long addr) { void **object; struct kmem_cache_cpu *c; @@ -1613,14 +1613,14 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); + return slab_alloc(s, gfpflags, -1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); + return slab_alloc(s, gfpflags, node, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif @@ -1634,7 +1634,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr, unsigned int offset) + void *x, unsigned long addr, unsigned int offset) { void *prior; void **object = (void *)x; @@ -1704,7 +1704,7 @@ debug: * with all sorts of special processing. */ static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, void *addr) + struct page *page, void *x, unsigned long addr) { void **object = (void *)x; struct kmem_cache_cpu *c; @@ -1731,7 +1731,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); - slab_free(s, page, x, __builtin_return_address(0)); + slab_free(s, page, x, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); @@ -2659,7 +2659,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, __builtin_return_address(0)); + return slab_alloc(s, flags, -1, _RET_IP_); } EXPORT_SYMBOL(__kmalloc); @@ -2687,7 +2687,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, __builtin_return_address(0)); + return slab_alloc(s, flags, node, _RET_IP_); } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2744,7 +2744,7 @@ void kfree(const void *x) put_page(page); return; } - slab_free(page->slab, page, object, __builtin_return_address(0)); + slab_free(page->slab, page, object, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -3200,7 +3200,7 @@ static struct notifier_block __cpuinitdata slab_notifier = { #endif -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; @@ -3216,7 +3216,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, void *caller) + int node, unsigned long caller) { struct kmem_cache *s; @@ -3427,7 +3427,7 @@ static void resiliency_test(void) {}; struct location { unsigned long count; - void *addr; + unsigned long addr; long long sum_time; long min_time; long max_time; @@ -3475,7 +3475,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, { long start, end, pos; struct location *l; - void *caddr; + unsigned long caddr; unsigned long age = jiffies - track->when; start = -1; -- cgit v1.2.3-59-g8ed1b From b9ce08c01020eb28bfbfa6faf1c740281c5f418e Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:03 +0300 Subject: kmemtrace: Core implementation. kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected data is then fed to the userspace application in order to analyse allocation hotspots, internal fragmentation and so on, making it possible to see how well an allocator performs, as well as debug and profile kernel code. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- Documentation/kernel-parameters.txt | 10 ++ MAINTAINERS | 6 + include/linux/kmemtrace.h | 85 +++++++++ init/main.c | 2 + lib/Kconfig.debug | 28 +++ mm/Makefile | 1 + mm/kmemtrace.c | 335 ++++++++++++++++++++++++++++++++++++ 7 files changed, 467 insertions(+) create mode 100644 include/linux/kmemtrace.h create mode 100644 mm/kmemtrace.c diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e0f346d201ed..542c2d8843db 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -49,6 +49,7 @@ parameter is applicable: ISAPNP ISA PnP code is enabled. ISDN Appropriate ISDN support is enabled. JOY Appropriate joystick support is enabled. + KMEMTRACE kmemtrace is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. LOOP Loopback device support is enabled. @@ -1018,6 +1019,15 @@ and is between 256 and 4096 characters. It is defined in the file use the HighMem zone if it exists, and the Normal zone if it does not. + kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } + Controls whether kmemtrace is enabled + at boot-time. + + kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of + subbufs kmemtrace's relay channel has. Set this + higher than default (KMEMTRACE_N_SUBBUFS in code) if + you experience buffer overruns. + movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter is similar to kernelcore except it specifies the amount of memory used for migratable allocations. diff --git a/MAINTAINERS b/MAINTAINERS index 618c1ef4a397..e2b3c8555051 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2565,6 +2565,12 @@ M: jason.wessel@windriver.com L: kgdb-bugreport@lists.sourceforge.net S: Maintained +KMEMTRACE +P: Eduard - Gabriel Munteanu +M: eduard.munteanu@linux360.ro +L: linux-kernel@vger.kernel.org +S: Maintained + KPROBES P: Ananth N Mavinakayanahalli M: ananth@in.ibm.com diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h new file mode 100644 index 000000000000..2c332010cb4e --- /dev/null +++ b/include/linux/kmemtrace.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include +#include + +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + +#ifdef CONFIG_KMEMTRACE + +extern void kmemtrace_init(void); + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", + type_id, call_site, (unsigned long) ptr, + bytes_req, bytes_alloc, (unsigned long) gfp_flags, node); +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + trace_mark(kmemtrace_free, "type_id %d call_site %lu ptr %lu", + type_id, call_site, (unsigned long) ptr); +} + +#else /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_init(void) +{ +} + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ +} + +#endif /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_mark_alloc_node(type_id, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/init/main.c b/init/main.c index 7e117a231af1..be1fe2242a55 100644 --- a/init/main.c +++ b/init/main.c @@ -69,6 +69,7 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include @@ -653,6 +654,7 @@ asmlinkage void __init start_kernel(void) enable_debug_pagealloc(); cpu_hotplug_init(); kmem_cache_init(); + kmemtrace_init(); debug_objects_mem_init(); idr_init_cache(); setup_per_cpu_pageset(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b0f239e443bc..78d669b461d2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -803,6 +803,34 @@ config FIREWIRE_OHCI_REMOTE_DMA If unsure, say N. +config KMEMTRACE + bool "Kernel memory tracer (kmemtrace)" + depends on RELAY && DEBUG_FS && MARKERS + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. + +config KMEMTRACE_DEFAULT_ENABLED + bool "Enabled by default at boot" + depends on KMEMTRACE + help + Say Y here to enable kmemtrace at boot-time by default. Whatever + the choice, the behavior can be overridden by a kernel parameter, + as described in documentation. + menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/Makefile b/mm/Makefile index c06b45a1ff5f..3782eb66d4b3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c new file mode 100644 index 000000000000..83ad1cc71a92 --- /dev/null +++ b/mm/kmemtrace.c @@ -0,0 +1,335 @@ +/* + * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define KMEMTRACE_SUBBUF_SIZE 524288 +#define KMEMTRACE_DEF_N_SUBBUFS 20 + +static struct rchan *kmemtrace_chan; +static u32 kmemtrace_buf_overruns; + +static unsigned int kmemtrace_n_subbufs; +#ifdef CONFIG_KMEMTRACE_DEFAULT_ENABLED +static unsigned int kmemtrace_enabled = 1; +#else +static unsigned int kmemtrace_enabled = 0; +#endif + +/* + * The sequence number is used for reordering kmemtrace packets + * in userspace, since they are logged as per-CPU data. + * + * atomic_t should always be a 32-bit signed integer. Wraparound is not + * likely to occur, but userspace can deal with it by expecting a certain + * sequence number in the next packet that will be read. + */ +static atomic_t kmemtrace_seq_num; + +#define KMEMTRACE_ABI_VERSION 1 + +static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION; + +enum kmemtrace_event_id { + KMEMTRACE_EVENT_ALLOC = 0, + KMEMTRACE_EVENT_FREE, +}; + +struct kmemtrace_event { + u8 event_id; + u8 type_id; + u16 event_size; + s32 seq_num; + u64 call_site; + u64 ptr; +} __attribute__ ((__packed__)); + +struct kmemtrace_stats_alloc { + u64 bytes_req; + u64 bytes_alloc; + u32 gfp_flags; + s32 numa_node; +} __attribute__ ((__packed__)); + +static void kmemtrace_probe_alloc(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + struct kmemtrace_stats_alloc *stats; + void *buf; + + local_irq_save(flags); + + buf = relay_reserve(kmemtrace_chan, + sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc)); + if (!buf) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + + ev = buf; + ev->event_id = KMEMTRACE_EVENT_ALLOC; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event) + + sizeof(struct kmemtrace_stats_alloc); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + + stats = buf + sizeof(struct kmemtrace_event); + stats->bytes_req = va_arg(*args, unsigned long); + stats->bytes_alloc = va_arg(*args, unsigned long); + stats->gfp_flags = va_arg(*args, unsigned long); + stats->numa_node = va_arg(*args, int); + +failed: + local_irq_restore(flags); +} + +static void kmemtrace_probe_free(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + unsigned long flags; + struct kmemtrace_event *ev; + + local_irq_save(flags); + + ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event)); + if (!ev) + goto failed; + + /* + * Don't convert this to use structure initializers, + * C99 does not guarantee the rvalues evaluation order. + */ + ev->event_id = KMEMTRACE_EVENT_FREE; + ev->type_id = va_arg(*args, int); + ev->event_size = sizeof(struct kmemtrace_event); + ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); + ev->call_site = va_arg(*args, unsigned long); + ev->ptr = va_arg(*args, unsigned long); + +failed: + local_irq_restore(flags); +} + +static struct dentry * +kmemtrace_create_buf_file(const char *filename, struct dentry *parent, + int mode, struct rchan_buf *buf, int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static int kmemtrace_remove_buf_file(struct dentry *dentry) +{ + debugfs_remove(dentry); + + return 0; +} + +static int kmemtrace_subbuf_start(struct rchan_buf *buf, + void *subbuf, + void *prev_subbuf, + size_t prev_padding) +{ + if (relay_buf_full(buf)) { + /* + * We know it's not SMP-safe, but neither + * debugfs_create_u32() is. + */ + kmemtrace_buf_overruns++; + return 0; + } + + return 1; +} + +static struct rchan_callbacks relay_callbacks = { + .create_buf_file = kmemtrace_create_buf_file, + .remove_buf_file = kmemtrace_remove_buf_file, + .subbuf_start = kmemtrace_subbuf_start, +}; + +static struct dentry *kmemtrace_dir; +static struct dentry *kmemtrace_overruns_dentry; +static struct dentry *kmemtrace_abi_version_dentry; + +static struct dentry *kmemtrace_enabled_dentry; + +static int kmemtrace_start_probes(void) +{ + int err; + + err = marker_probe_register("kmemtrace_alloc", "type_id %d " + "call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu " + "gfp_flags %lu node %d", + kmemtrace_probe_alloc, NULL); + if (err) + return err; + err = marker_probe_register("kmemtrace_free", "type_id %d " + "call_site %lu ptr %lu", + kmemtrace_probe_free, NULL); + + return err; +} + +static void kmemtrace_stop_probes(void) +{ + marker_probe_unregister("kmemtrace_alloc", + kmemtrace_probe_alloc, NULL); + marker_probe_unregister("kmemtrace_free", + kmemtrace_probe_free, NULL); +} + +static int kmemtrace_enabled_get(void *data, u64 *val) +{ + *val = *((int *) data); + + return 0; +} + +static int kmemtrace_enabled_set(void *data, u64 val) +{ + u64 old_val = kmemtrace_enabled; + + *((int *) data) = !!val; + + if (old_val == val) + return 0; + if (val) + kmemtrace_start_probes(); + else + kmemtrace_stop_probes(); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops, + kmemtrace_enabled_get, + kmemtrace_enabled_set, "%llu\n"); + +static void kmemtrace_cleanup(void) +{ + if (kmemtrace_enabled_dentry) + debugfs_remove(kmemtrace_enabled_dentry); + + kmemtrace_stop_probes(); + + if (kmemtrace_abi_version_dentry) + debugfs_remove(kmemtrace_abi_version_dentry); + if (kmemtrace_overruns_dentry) + debugfs_remove(kmemtrace_overruns_dentry); + + relay_close(kmemtrace_chan); + kmemtrace_chan = NULL; + + if (kmemtrace_dir) + debugfs_remove(kmemtrace_dir); +} + +static int __init kmemtrace_setup_late(void) +{ + if (!kmemtrace_chan) + goto failed; + + kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL); + if (!kmemtrace_dir) + goto cleanup; + + kmemtrace_abi_version_dentry = + debugfs_create_u32("abi_version", S_IRUSR, + kmemtrace_dir, &kmemtrace_abi_version); + kmemtrace_overruns_dentry = + debugfs_create_u32("total_overruns", S_IRUSR, + kmemtrace_dir, &kmemtrace_buf_overruns); + if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry) + goto cleanup; + + kmemtrace_enabled_dentry = + debugfs_create_file("enabled", S_IRUSR | S_IWUSR, + kmemtrace_dir, &kmemtrace_enabled, + &kmemtrace_enabled_fops); + if (!kmemtrace_enabled_dentry) + goto cleanup; + + if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir)) + goto cleanup; + + printk(KERN_INFO "kmemtrace: fully up.\n"); + + return 0; + +cleanup: + kmemtrace_cleanup(); +failed: + return 1; +} +late_initcall(kmemtrace_setup_late); + +static int __init kmemtrace_set_boot_enabled(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "yes")) + kmemtrace_enabled = 1; + else if (!strcmp(str, "no")) + kmemtrace_enabled = 0; + else + return -EINVAL; + + return 0; +} +early_param("kmemtrace.enable", kmemtrace_set_boot_enabled); + +static int __init kmemtrace_set_subbufs(char *str) +{ + get_option(&str, &kmemtrace_n_subbufs); + return 0; +} +early_param("kmemtrace.subbufs", kmemtrace_set_subbufs); + +void kmemtrace_init(void) +{ + if (!kmemtrace_enabled) + return; + + if (!kmemtrace_n_subbufs) + kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS; + + kmemtrace_chan = relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE, + kmemtrace_n_subbufs, &relay_callbacks, + NULL); + if (unlikely(!kmemtrace_chan)) { + printk(KERN_ERR "kmemtrace: could not open relay channel.\n"); + return; + } + + if (unlikely(kmemtrace_start_probes())) + goto probe_fail; + + printk(KERN_INFO "kmemtrace: early init successful.\n"); + + return; + +probe_fail: + printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); + kmemtrace_cleanup(); +} + -- cgit v1.2.3-59-g8ed1b From aa46a7e0228c0477708ce44a0c5621902b3c157c Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:04 +0300 Subject: kmemtrace: Additional documentation. Documented kmemtrace's ABI, purpose and design. Also includes a short usage guide, FAQ, as well as a link to the userspace application's Git repository, which is currently hosted at repo.or.cz. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- Documentation/ABI/testing/debugfs-kmemtrace | 71 ++++++++++++++++ Documentation/vm/kmemtrace.txt | 126 ++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+) create mode 100644 Documentation/ABI/testing/debugfs-kmemtrace create mode 100644 Documentation/vm/kmemtrace.txt diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace new file mode 100644 index 000000000000..a5ff9a64c9ee --- /dev/null +++ b/Documentation/ABI/testing/debugfs-kmemtrace @@ -0,0 +1,71 @@ +What: /sys/kernel/debug/kmemtrace/ +Date: July 2008 +Contact: Eduard - Gabriel Munteanu +Description: + +In kmemtrace-enabled kernels, the following files are created: + +/sys/kernel/debug/kmemtrace/ + cpu (0400) Per-CPU tracing data, see below. (binary) + total_overruns (0400) Total number of bytes which were dropped from + cpu files because of full buffer condition, + non-binary. (text) + abi_version (0400) Kernel's kmemtrace ABI version. (text) + +Each per-CPU file should be read according to the relay interface. That is, +the reader should set affinity to that specific CPU and, as currently done by +the userspace application (though there are other methods), use poll() with +an infinite timeout before every read(). Otherwise, erroneous data may be +read. The binary data has the following _core_ format: + + Event ID (1 byte) Unsigned integer, one of: + 0 - represents an allocation (KMEMTRACE_EVENT_ALLOC) + 1 - represents a freeing of previously allocated memory + (KMEMTRACE_EVENT_FREE) + Type ID (1 byte) Unsigned integer, one of: + 0 - this is a kmalloc() / kfree() + 1 - this is a kmem_cache_alloc() / kmem_cache_free() + 2 - this is a __get_free_pages() et al. + Event size (2 bytes) Unsigned integer representing the + size of this event. Used to extend + kmemtrace. Discard the bytes you + don't know about. + Sequence number (4 bytes) Signed integer used to reorder data + logged on SMP machines. Wraparound + must be taken into account, although + it is unlikely. + Caller address (8 bytes) Return address to the caller. + Pointer to mem (8 bytes) Pointer to target memory area. Can be + NULL, but not all such calls might be + recorded. + +In case of KMEMTRACE_EVENT_ALLOC events, the next fields follow: + + Requested bytes (8 bytes) Total number of requested bytes, + unsigned, must not be zero. + Allocated bytes (8 bytes) Total number of actually allocated + bytes, unsigned, must not be lower + than requested bytes. + Requested flags (4 bytes) GFP flags supplied by the caller. + Target CPU (4 bytes) Signed integer, valid for event id 1. + If equal to -1, target CPU is the same + as origin CPU, but the reverse might + not be true. + +The data is made available in the same endianness the machine has. + +Other event ids and type ids may be defined and added. Other fields may be +added by increasing event size, but see below for details. +Every modification to the ABI, including new id definitions, are followed +by bumping the ABI version by one. + +Adding new data to the packet (features) is done at the end of the mandatory +data: + Feature size (2 byte) + Feature ID (1 byte) + Feature data (Feature size - 4 bytes) + + +Users: + kmemtrace-user - git://repo.or.cz/kmemtrace-user.git + diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/vm/kmemtrace.txt new file mode 100644 index 000000000000..75360b142ba4 --- /dev/null +++ b/Documentation/vm/kmemtrace.txt @@ -0,0 +1,126 @@ + kmemtrace - Kernel Memory Tracer + + by Eduard - Gabriel Munteanu + + +I. Introduction +=============== + +kmemtrace helps kernel developers figure out two things: +1) how different allocators (SLAB, SLUB etc.) perform +2) how kernel code allocates memory and how much + +To do this, we trace every allocation and export information to the userspace +through the relay interface. We export things such as the number of requested +bytes, the number of bytes actually allocated (i.e. including internal +fragmentation), whether this is a slab allocation or a plain kmalloc() and so +on. + +The actual analysis is performed by a userspace tool (see section III for +details on where to get it from). It logs the data exported by the kernel, +processes it and (as of writing this) can provide the following information: +- the total amount of memory allocated and fragmentation per call-site +- the amount of memory allocated and fragmentation per allocation +- total memory allocated and fragmentation in the collected dataset +- number of cross-CPU allocation and frees (makes sense in NUMA environments) + +Moreover, it can potentially find inconsistent and erroneous behavior in +kernel code, such as using slab free functions on kmalloc'ed memory or +allocating less memory than requested (but not truly failed allocations). + +kmemtrace also makes provisions for tracing on some arch and analysing the +data on another. + +II. Design and goals +==================== + +kmemtrace was designed to handle rather large amounts of data. Thus, it uses +the relay interface to export whatever is logged to userspace, which then +stores it. Analysis and reporting is done asynchronously, that is, after the +data is collected and stored. By design, it allows one to log and analyse +on different machines and different arches. + +As of writing this, the ABI is not considered stable, though it might not +change much. However, no guarantees are made about compatibility yet. When +deemed stable, the ABI should still allow easy extension while maintaining +backward compatibility. This is described further in Documentation/ABI. + +Summary of design goals: + - allow logging and analysis to be done across different machines + - be fast and anticipate usage in high-load environments (*) + - be reasonably extensible + - make it possible for GNU/Linux distributions to have kmemtrace + included in their repositories + +(*) - one of the reasons Pekka Enberg's original userspace data analysis + tool's code was rewritten from Perl to C (although this is more than a + simple conversion) + + +III. Quick usage guide +====================== + +1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable +CONFIG_KMEMTRACE and CONFIG_DEFAULT_ENABLED). + +2) Get the userspace tool and build it: +$ git-clone git://repo.or.cz/kmemtrace-user.git # current repository +$ cd kmemtrace-user/ +$ ./autogen.sh +$ ./configure +$ make + +3) Boot the kmemtrace-enabled kernel if you haven't, preferably in the +'single' runlevel (so that relay buffers don't fill up easily), and run +kmemtrace: +# '$' does not mean user, but root here. +$ mount -t debugfs none /sys/kernel/debug +$ mount -t proc none /proc +$ cd path/to/kmemtrace-user/ +$ ./kmemtraced +Wait a bit, then stop it with CTRL+C. +$ cat /sys/kernel/debug/kmemtrace/total_overruns # Check if we didn't + # overrun, should + # be zero. +$ (Optionally) [Run kmemtrace_check separately on each cpu[0-9]*.out file to + check its correctness] +$ ./kmemtrace-report + +Now you should have a nice and short summary of how the allocator performs. + +IV. FAQ and known issues +======================== + +Q: 'cat /sys/kernel/debug/kmemtrace/total_overruns' is non-zero, how do I fix +this? Should I worry? +A: If it's non-zero, this affects kmemtrace's accuracy, depending on how +large the number is. You can fix it by supplying a higher +'kmemtrace.subbufs=N' kernel parameter. +--- + +Q: kmemtrace_check reports errors, how do I fix this? Should I worry? +A: This is a bug and should be reported. It can occur for a variety of +reasons: + - possible bugs in relay code + - possible misuse of relay by kmemtrace + - timestamps being collected unorderly +Or you may fix it yourself and send us a patch. +--- + +Q: kmemtrace_report shows many errors, how do I fix this? Should I worry? +A: This is a known issue and I'm working on it. These might be true errors +in kernel code, which may have inconsistent behavior (e.g. allocating memory +with kmem_cache_alloc() and freeing it with kfree()). Pekka Enberg pointed +out this behavior may work with SLAB, but may fail with other allocators. + +It may also be due to lack of tracing in some unusual allocator functions. + +We don't want bug reports regarding this issue yet. +--- + +V. See also +=========== + +Documentation/kernel-parameters.txt +Documentation/ABI/testing/debugfs-kmemtrace + -- cgit v1.2.3-59-g8ed1b From 36555751c6751a5bdfd6d7bdf0648343bb1ef0de Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:05 +0300 Subject: kmemtrace: SLAB hooks. This adds hooks for the SLAB allocator, to allow tracing with kmemtrace. We also convert some inline functions to __always_inline to make sure _RET_IP_, which expands to __builtin_return_address(0), always works as expected. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slab_def.h | 68 ++++++++++++++++++++++++++++++++++++++++------ mm/slab.c | 71 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 123 insertions(+), 16 deletions(-) diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 39c3a5eb8ebe..7555ce99f6d2 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,6 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include +#include /* Size description struct for general caches. */ struct cache_sizes { @@ -28,8 +29,26 @@ extern struct cache_sizes malloc_sizes[]; void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); -static inline void *kmalloc(size_t size, gfp_t flags) +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags); +extern size_t slab_buffer_size(struct kmem_cache *cachep); +#else +static __always_inline void * +kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) { + return kmem_cache_alloc(cachep, flags); +} +static inline size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return 0; +} +#endif + +static __always_inline void *kmalloc(size_t size, gfp_t flags) +{ + struct kmem_cache *cachep; + void *ret; + if (__builtin_constant_p(size)) { int i = 0; @@ -50,10 +69,17 @@ static inline void *kmalloc(size_t size, gfp_t flags) found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep, - flags); + cachep = malloc_sizes[i].cs_dmacachep; + else #endif - return kmem_cache_alloc(malloc_sizes[i].cs_cachep, flags); + cachep = malloc_sizes[i].cs_cachep; + + ret = kmem_cache_alloc_notrace(cachep, flags); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, + size, slab_buffer_size(cachep), flags); + + return ret; } return __kmalloc(size, flags); } @@ -62,8 +88,25 @@ found: extern void *__kmalloc_node(size_t size, gfp_t flags, int node); extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid); +#else +static __always_inline void * +kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return kmem_cache_alloc_node(cachep, flags, nodeid); +} +#endif + +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + struct kmem_cache *cachep; + void *ret; + if (__builtin_constant_p(size)) { int i = 0; @@ -84,11 +127,18 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) found: #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - return kmem_cache_alloc_node(malloc_sizes[i].cs_dmacachep, - flags, node); + cachep = malloc_sizes[i].cs_dmacachep; + else #endif - return kmem_cache_alloc_node(malloc_sizes[i].cs_cachep, - flags, node); + cachep = malloc_sizes[i].cs_cachep; + + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, + ret, size, slab_buffer_size(cachep), + flags, node); + + return ret; } return __kmalloc_node(size, flags, node); } diff --git a/mm/slab.c b/mm/slab.c index a14787799014..b6d9b8cdefa9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -112,6 +112,7 @@ #include #include #include +#include #include #include @@ -568,6 +569,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif +#ifdef CONFIG_KMEMTRACE +size_t slab_buffer_size(struct kmem_cache *cachep) +{ + return cachep->buffer_size; +} +EXPORT_SYMBOL(slab_buffer_size); +#endif + /* * Do not go above this order unless 0 objects fit into the slab. */ @@ -3613,10 +3622,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, flags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +{ + return __cache_alloc(cachep, flags, __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + /** * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. * @cachep: the cache we're checking against @@ -3661,23 +3683,47 @@ out: #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, - __builtin_return_address(0)); + void *ret = __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + obj_size(cachep), cachep->buffer_size, + flags, nodeid); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, + gfp_t flags, + int nodeid) +{ + return __cache_alloc_node(cachep, flags, nodeid, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; + void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node(cachep, flags, node); + ret = kmem_cache_alloc_node_notrace(cachep, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags, node); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, @@ -3710,6 +3756,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, void *caller) { struct kmem_cache *cachep; + void *ret; /* If you want to save a few bytes .text space: replace * __ with kmem_. @@ -3719,11 +3766,17 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, cachep = __find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return __cache_alloc(cachep, flags, caller); + ret = __cache_alloc(cachep, flags, caller); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, cachep->buffer_size, flags); + + return ret; } -#ifdef CONFIG_DEBUG_SLAB +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, __builtin_return_address(0)); @@ -3762,6 +3815,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) debug_check_no_obj_freed(objp, obj_size(cachep)); __cache_free(cachep, objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp); } EXPORT_SYMBOL(kmem_cache_free); @@ -3788,6 +3843,8 @@ void kfree(const void *objp) debug_check_no_obj_freed(objp, obj_size(c)); __cache_free(c, (void *)objp); local_irq_restore(flags); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp); } EXPORT_SYMBOL(kfree); -- cgit v1.2.3-59-g8ed1b From 3eae2cb24a96509e0a38cc48dc1538a2826f4e33 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 10 Aug 2008 20:14:07 +0300 Subject: kmemtrace: SLOB hooks. This adds hooks for the SLOB allocator, to allow tracing with kmemtrace. We also convert some inline functions to __always_inline to make sure _RET_IP_, which expands to __builtin_return_address(0), always works as expected. Acked-by: Matt Mackall Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slob_def.h | 9 +++++---- mm/slob.c | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h index 59a3fa476ab9..0ec00b39d006 100644 --- a/include/linux/slob_def.h +++ b/include/linux/slob_def.h @@ -3,14 +3,15 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags) { return kmem_cache_alloc_node(cachep, flags, -1); } void *__kmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { return __kmalloc_node(size, flags, node); } @@ -23,12 +24,12 @@ static inline void *kmalloc_node(size_t size, gfp_t flags, int node) * kmalloc is the normal method of allocating memory * in the kernel. */ -static inline void *kmalloc(size_t size, gfp_t flags) +static __always_inline void *kmalloc(size_t size, gfp_t flags) { return __kmalloc_node(size, flags, -1); } -static inline void *__kmalloc(size_t size, gfp_t flags) +static __always_inline void *__kmalloc(size_t size, gfp_t flags) { return kmalloc(size, flags); } diff --git a/mm/slob.c b/mm/slob.c index cb675d126791..55de44ae5d30 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,6 +65,7 @@ #include #include #include +#include #include /* @@ -463,27 +464,38 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + void *ret; if (size < PAGE_SIZE - align) { if (!size) return ZERO_SIZE_PTR; m = slob_alloc(size + align, gfp, align, node); + if (!m) return NULL; *m = size; - return (void *)m + align; + ret = (void *)m + align; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, size + align, gfp, node); } else { - void *ret; + unsigned int order = get_order(size); - ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); + ret = slob_new_page(gfp | __GFP_COMP, order, node); if (ret) { struct page *page; page = virt_to_page(ret); page->private = size; } - return ret; + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << order, gfp, node); } + + return ret; } EXPORT_SYMBOL(__kmalloc_node); @@ -501,6 +513,8 @@ void kfree(const void *block) slob_free(m, *m + align); } else put_page(&sp->page); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block); } EXPORT_SYMBOL(kfree); @@ -569,10 +583,19 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; - if (c->size < PAGE_SIZE) + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); - else + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); + } else { b = slob_new_page(flags, get_order(c->size), node); + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, + _RET_IP_, b, c->size, + PAGE_SIZE << get_order(c->size), + flags, node); + } if (c->ctor) c->ctor(b); @@ -608,6 +631,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } else { __kmem_cache_free(b, c->size); } + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b); } EXPORT_SYMBOL(kmem_cache_free); -- cgit v1.2.3-59-g8ed1b From 5b882be4e00e53a44f47ad7eb997cac2938848bf Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:26 +0300 Subject: kmemtrace: SLUB hooks. This adds hooks for the SLUB allocator, to allow tracing with kmemtrace. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 53 ++++++++++++++++++++++++++++++++++++--- mm/slub.c | 65 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 109 insertions(+), 9 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..dc28432b5b9a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,6 +10,7 @@ #include #include #include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -204,13 +205,31 @@ static __always_inline struct kmem_cache *kmalloc_slab(size_t size) void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags); +#else +static __always_inline void * +kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return kmem_cache_alloc(s, gfpflags); +} +#endif + static __always_inline void *kmalloc_large(size_t size, gfp_t flags) { - return (void *)__get_free_pages(flags | __GFP_COMP, get_order(size)); + unsigned int order = get_order(size); + void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret, + size, PAGE_SIZE << order, flags); + + return ret; } static __always_inline void *kmalloc(size_t size, gfp_t flags) { + void *ret; + if (__builtin_constant_p(size)) { if (size > PAGE_SIZE) return kmalloc_large(size, flags); @@ -221,7 +240,13 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) if (!s) return ZERO_SIZE_PTR; - return kmem_cache_alloc(s, flags); + ret = kmem_cache_alloc_notrace(s, flags); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, + _THIS_IP_, ret, + size, s->size, flags); + + return ret; } } return __kmalloc(size, flags); @@ -231,8 +256,24 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) void *__kmalloc_node(size_t size, gfp_t flags, int node); void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); +#ifdef CONFIG_KMEMTRACE +extern void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node); +#else +static __always_inline void * +kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return kmem_cache_alloc_node(s, gfpflags, node); +} +#endif + static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { + void *ret; + if (__builtin_constant_p(size) && size <= PAGE_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); @@ -240,7 +281,13 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) if (!s) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node(s, flags, node); + ret = kmem_cache_alloc_node_notrace(s, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _THIS_IP_, ret, + size, s->size, flags, node); + + return ret; } return __kmalloc_node(size, flags, node); } diff --git a/mm/slub.c b/mm/slub.c index 06da86654875..4c48a0146afd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * Lock order: @@ -1613,18 +1614,46 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc(s, gfpflags, -1, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc); +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +{ + return slab_alloc(s, gfpflags, -1, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_notrace); +#endif + #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret, + s->objsize, s->size, gfpflags, node); + + return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); #endif +#ifdef CONFIG_KMEMTRACE +void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, + gfp_t gfpflags, + int node) +{ + return slab_alloc(s, gfpflags, node, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +#endif + /* * Slow patch handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -1732,6 +1761,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x) page = virt_to_head_page(x); slab_free(s, page, x, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); @@ -2650,6 +2681,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, flags); @@ -2659,7 +2691,12 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, -1, _RET_IP_); + ret = slab_alloc(s, flags, -1, _RET_IP_); + + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags); + + return ret; } EXPORT_SYMBOL(__kmalloc); @@ -2678,16 +2715,30 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; + void *ret; - if (unlikely(size > PAGE_SIZE)) - return kmalloc_large_node(size, flags, node); + if (unlikely(size > PAGE_SIZE)) { + ret = kmalloc_large_node(size, flags, node); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + _RET_IP_, ret, + size, PAGE_SIZE << get_order(size), + flags, node); + + return ret; + } s = get_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, flags, node, _RET_IP_); + ret = slab_alloc(s, flags, node, _RET_IP_); + + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret, + size, s->size, flags, node); + + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif @@ -2745,6 +2796,8 @@ void kfree(const void *x) return; } slab_free(page->slab, page, object, _RET_IP_); + + kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x); } EXPORT_SYMBOL(kfree); -- cgit v1.2.3-59-g8ed1b From 4a80b24bb2ec66a5cb7fa5ff8335907f09288200 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:27 +0300 Subject: kmemtrace: Fix typos in documentation. Corrected the ABI description and the kmemtrace usage guide. Thanks to Randy Dunlap for noticing these errors. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- Documentation/ABI/testing/debugfs-kmemtrace | 2 +- Documentation/vm/kmemtrace.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/debugfs-kmemtrace b/Documentation/ABI/testing/debugfs-kmemtrace index a5ff9a64c9ee..5e6a92a02d85 100644 --- a/Documentation/ABI/testing/debugfs-kmemtrace +++ b/Documentation/ABI/testing/debugfs-kmemtrace @@ -63,7 +63,7 @@ Adding new data to the packet (features) is done at the end of the mandatory data: Feature size (2 byte) Feature ID (1 byte) - Feature data (Feature size - 4 bytes) + Feature data (Feature size - 3 bytes) Users: diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/vm/kmemtrace.txt index 75360b142ba4..f656cac3558c 100644 --- a/Documentation/vm/kmemtrace.txt +++ b/Documentation/vm/kmemtrace.txt @@ -61,7 +61,7 @@ III. Quick usage guide ====================== 1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable -CONFIG_KMEMTRACE and CONFIG_DEFAULT_ENABLED). +CONFIG_KMEMTRACE and CONFIG_KMEMTRACE_DEFAULT_ENABLED). 2) Get the userspace tool and build it: $ git-clone git://repo.or.cz/kmemtrace-user.git # current repository -- cgit v1.2.3-59-g8ed1b From 73cd6af0413225b0ada8b8881c3e0cfd26506dfa Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:24 +0300 Subject: kmemtrace: Better alternative to "kmemtrace: fix printk format warnings". Fix the problem "kmemtrace: fix printk format warnings" attempted to fix, but resulted in marker-probe format mismatch warnings. Instead of carrying size_t into probes, we get rid of it by casting to unsigned long, just as we did with gfp_t. This way, we don't need to change marker format strings and we don't have to rely on other format specifiers like "%zu", making for consistent use of more generic data types (since there are no format specifiers for gfp_t, for example). Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/kmemtrace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h index 2c332010cb4e..5bea8ead6a6b 100644 --- a/include/linux/kmemtrace.h +++ b/include/linux/kmemtrace.h @@ -33,7 +33,8 @@ static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", type_id, call_site, (unsigned long) ptr, - bytes_req, bytes_alloc, (unsigned long) gfp_flags, node); + (unsigned long) bytes_req, (unsigned long) bytes_alloc, + (unsigned long) gfp_flags, node); } static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, -- cgit v1.2.3-59-g8ed1b From 94b528d0564997d88e8bf166d8c9080536ad8bdc Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Sun, 24 Aug 2008 20:49:35 +0300 Subject: kmemtrace: SLUB hooks for caller-tracking functions. This patch adds kmemtrace hooks for __kmalloc_track_caller() and __kmalloc_node_track_caller(). Currently, they set the call site pointer to the value recieved as a parameter. (This could change if we implement stack trace exporting in kmemtrace.) Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- mm/slub.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 4c48a0146afd..4ce61c86f581 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3256,6 +3256,7 @@ static struct notifier_block __cpuinitdata slab_notifier = { void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large(size, gfpflags); @@ -3265,13 +3266,20 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, -1, caller); + + /* Honor the call site pointer we recieved. */ + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, (unsigned long) caller, + ret, size, s->size, gfpflags); + + return ret; } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, int node, unsigned long caller) { struct kmem_cache *s; + void *ret; if (unlikely(size > PAGE_SIZE)) return kmalloc_large_node(size, gfpflags, node); @@ -3281,7 +3289,14 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - return slab_alloc(s, gfpflags, node, caller); + ret = slab_alloc(s, gfpflags, node, caller); + + /* Honor the call site pointer we recieved. */ + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, + (unsigned long) caller, ret, + size, s->size, gfpflags, node); + + return ret; } #ifdef CONFIG_SLUB_DEBUG -- cgit v1.2.3-59-g8ed1b From 2e67624c22321fa40ad3aa89c307c84bd679d9b2 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Mon, 1 Sep 2008 10:11:54 +0300 Subject: kmemtrace: remove unnecessary casts Now that we use _RET_IP_ there's no need to cast 'caller' to unsigned long. Signed-off-by: Pekka Enberg --- mm/slub.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 4ce61c86f581..e27472368cc3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3269,8 +3269,8 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) ret = slab_alloc(s, gfpflags, -1, caller); /* Honor the call site pointer we recieved. */ - kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, (unsigned long) caller, - ret, size, s->size, gfpflags); + kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, caller, ret, size, + s->size, gfpflags); return ret; } @@ -3292,8 +3292,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, ret = slab_alloc(s, gfpflags, node, caller); /* Honor the call site pointer we recieved. */ - kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, - (unsigned long) caller, ret, + kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, caller, ret, size, s->size, gfpflags, node); return ret; -- cgit v1.2.3-59-g8ed1b From faa97abe6a3673af268abb661c6b663252a911aa Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 10 Oct 2008 10:57:44 +0300 Subject: kmemtrace: allow kmemtrace to be enabled after boot The kmemtrace_init() function returns early if kmemtrace is disabled at boot causing kmemtrace_setup_late() to also bail out on NULL channel. This has the unfortunate side effect that none of the debugfs files needed to enable kmemtrace after boot are created. Cc: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- mm/kmemtrace.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c index 83ad1cc71a92..f7a49c077df2 100644 --- a/mm/kmemtrace.c +++ b/mm/kmemtrace.c @@ -307,29 +307,29 @@ early_param("kmemtrace.subbufs", kmemtrace_set_subbufs); void kmemtrace_init(void) { - if (!kmemtrace_enabled) - return; - if (!kmemtrace_n_subbufs) kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS; kmemtrace_chan = relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE, kmemtrace_n_subbufs, &relay_callbacks, NULL); - if (unlikely(!kmemtrace_chan)) { + if (!kmemtrace_chan) { printk(KERN_ERR "kmemtrace: could not open relay channel.\n"); return; } - if (unlikely(kmemtrace_start_probes())) - goto probe_fail; - - printk(KERN_INFO "kmemtrace: early init successful.\n"); - - return; + if (!kmemtrace_enabled) { + printk(KERN_INFO "kmemtrace: disabled. Pass " + "kemtrace.enable=yes as kernel parameter for " + "boot-time tracing."); + return; + } + if (kmemtrace_start_probes()) { + printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); + kmemtrace_cleanup(); + return; + } -probe_fail: - printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); - kmemtrace_cleanup(); + printk(KERN_INFO "kmemtrace: enabled.\n"); } -- cgit v1.2.3-59-g8ed1b From bf6803d6fd654d9a73cd90308b5225d78655d027 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Fri, 10 Oct 2008 11:02:59 +0300 Subject: kmemtrace: remove config option for enabling tracing at boot Users can pass kmemtrace.enabled=yes as a kernel parameter to enable kmemtrace at boot so remove the useless CONFIG_KMEMTRACE_DEFAULT_ENABLED config option. Cc: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- Documentation/vm/kmemtrace.txt | 2 +- lib/Kconfig.debug | 8 -------- mm/kmemtrace.c | 8 +++----- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/Documentation/vm/kmemtrace.txt b/Documentation/vm/kmemtrace.txt index f656cac3558c..a956d9b7f943 100644 --- a/Documentation/vm/kmemtrace.txt +++ b/Documentation/vm/kmemtrace.txt @@ -61,7 +61,7 @@ III. Quick usage guide ====================== 1) Get a kernel that supports kmemtrace and build it accordingly (i.e. enable -CONFIG_KMEMTRACE and CONFIG_KMEMTRACE_DEFAULT_ENABLED). +CONFIG_KMEMTRACE). 2) Get the userspace tool and build it: $ git-clone git://repo.or.cz/kmemtrace-user.git # current repository diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 78d669b461d2..b5417e23ba94 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -823,14 +823,6 @@ config KMEMTRACE If unsure, say N. -config KMEMTRACE_DEFAULT_ENABLED - bool "Enabled by default at boot" - depends on KMEMTRACE - help - Say Y here to enable kmemtrace at boot-time by default. Whatever - the choice, the behavior can be overridden by a kernel parameter, - as described in documentation. - menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c index f7a49c077df2..f7704f52c4b1 100644 --- a/mm/kmemtrace.c +++ b/mm/kmemtrace.c @@ -19,11 +19,9 @@ static struct rchan *kmemtrace_chan; static u32 kmemtrace_buf_overruns; static unsigned int kmemtrace_n_subbufs; -#ifdef CONFIG_KMEMTRACE_DEFAULT_ENABLED -static unsigned int kmemtrace_enabled = 1; -#else -static unsigned int kmemtrace_enabled = 0; -#endif + +/* disabled by default */ +static unsigned int kmemtrace_enabled; /* * The sequence number is used for reordering kmemtrace packets -- cgit v1.2.3-59-g8ed1b From a4900437f3d76761a1646cd90254ccb01714a9ed Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 6 Nov 2008 17:42:18 +0100 Subject: kmemtrace: add missing newline This was causing artifacts in my dmesg. Acked-by: Eduard - Gabriel Munteanu Signed-off-by: Vegard Nossum Signed-off-by: Pekka Enberg --- mm/kmemtrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c index f7704f52c4b1..2a70a805027c 100644 --- a/mm/kmemtrace.c +++ b/mm/kmemtrace.c @@ -319,7 +319,7 @@ void kmemtrace_init(void) if (!kmemtrace_enabled) { printk(KERN_INFO "kmemtrace: disabled. Pass " "kemtrace.enable=yes as kernel parameter for " - "boot-time tracing."); + "boot-time tracing.\n"); return; } if (kmemtrace_start_probes()) { -- cgit v1.2.3-59-g8ed1b From a103e2ab7377dbbef2506be59c49a3f2ae10b60b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Dec 2008 15:07:47 +0100 Subject: tracing/selftest: remove TRACE_CONT reference Impact: build fix TRACE_CONT is gone - fix up the self-test too. Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 88c8eb70f54a..5013812578b1 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -9,7 +9,6 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_FN: case TRACE_CTX: case TRACE_WAKE: - case TRACE_CONT: case TRACE_STACK: case TRACE_PRINT: case TRACE_SPECIAL: -- cgit v1.2.3-59-g8ed1b From 2a38b1c4f123afa4579cb1d29380b337e9195cbb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Dec 2008 15:29:04 +0100 Subject: kmemtrace: move #include lines Impact: avoid conflicts with kmemcheck kmemcheck modifies the same area of slab.c and slub.c - move the include lines up a bit. Signed-off-by: Ingo Molnar --- mm/slab.c | 2 +- mm/slub.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index b6d9b8cdefa9..bcf08ea88380 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #include @@ -112,7 +113,6 @@ #include #include #include -#include #include #include diff --git a/mm/slub.c b/mm/slub.c index 4cd7bfd2ab2c..cc4001fee7ac 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,6 @@ #include #include #include -#include /* * Lock order: -- cgit v1.2.3-59-g8ed1b From 7a51cffbd10886c0557677dd916c090097c691ef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Dec 2008 16:03:40 +0100 Subject: relayfs: replace BUG() with WARN_ON() in relay_late_setup_files() Impact: turn boot crash into boot warning This BUG() can trigger: [ 16.684131] initcall fail_page_alloc_debugfs+0x0/0xc1 returned 0 after 0 usecs [ 16.692035] calling kmemtrace_setup_late+0x0/0xd5 @ 1 [ 16.700087] relay_late_setup_files: CPU 1 has no buffer, it must have! [ 16.704044] ------------[ cut here ]------------ [ 16.708030] kernel BUG at kernel/relay.c:680! [ 16.708030] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC [ 16.708030] last sysfs file: [ 16.708030] [ 16.708030] Pid: 1, comm: swapper Not tainted (2.6.28-tip-03903-g9a39f58-dirty #13207) System Product Name [ 16.708030] EIP: 0060:[] EFLAGS: 00010246 CPU: 1 [ 16.708030] EIP is at relay_late_setup_files+0x8c/0x176 Reduce it to a more reportable WARN_ONCE(). Signed-off-by: Ingo Molnar --- kernel/relay.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index 09ac2008f77b..d06450670c86 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -675,9 +675,7 @@ int relay_late_setup_files(struct rchan *chan, */ for_each_online_cpu(i) { if (unlikely(!chan->buf[i])) { - printk(KERN_ERR "relay_late_setup_files: CPU %u " - "has no buffer, it must have!\n", i); - BUG(); + WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n"); err = -EINVAL; break; } -- cgit v1.2.3-59-g8ed1b From 36994e58a48fb8f9651c7dc845a6de298aba5bfc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 29 Dec 2008 13:42:23 -0800 Subject: tracing/kmemtrace: normalize the raw tracer event to the unified tracing API Impact: new tracer plugin This patch adapts kmemtrace raw events tracing to the unified tracing API. To enable and use this tracer, just do the following: echo kmemtrace > /debugfs/tracing/current_tracer cat /debugfs/tracing/trace You will have the following output: # tracer: kmemtrace # # # ALLOC TYPE REQ GIVEN FLAGS POINTER NODE CALLER # FREE | | | | | | | | # | type_id 1 call_site 18446744071565527833 ptr 18446612134395152256 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345164672 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345164912 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 0 call_site 18446744071565636711 ptr 18446612134345165152 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1 type_id 0 call_site 18446744071566144042 ptr 18446612134346191680 bytes_req 1304 bytes_alloc 1312 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1 type_id 1 call_site 18446744071565585534 ptr 18446612134405955584 That was to stay backward compatible with the format output produced in inux/tracepoint.h. This is the default ouput, but note that I tried something else. If you change an option: echo kmem_minimalistic > /debugfs/trace_options and then cat /debugfs/trace, you will have the following output: # tracer: kmemtrace # # # ALLOC TYPE REQ GIVEN FLAGS POINTER NODE CALLER # FREE | | | | | | | | # | - C 0xffff88007c088780 file_free_rcu + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc780 -1 d_alloc - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc870 -1 d_alloc - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname + K 240 240 000000d0 0xffff8800790dc960 -1 d_alloc + K 1304 1312 000000d0 0xffff8800791d7340 -1 reiserfs_alloc_inode - C 0xffff88007cad6000 putname + K 4096 4096 000000d0 0xffff88007cad6000 -1 getname - C 0xffff88007cad6000 putname + K 992 1000 000000d0 0xffff880079045b58 -1 alloc_inode + K 768 1024 000080d0 0xffff88007c096400 -1 alloc_pipe_info + K 240 240 000000d0 0xffff8800790dca50 -1 d_alloc + K 272 320 000080d0 0xffff88007c088780 -1 get_empty_filp + K 272 320 000080d0 0xffff88007c088000 -1 get_empty_filp Yeah I shall confess kmem_minimalistic should be: kmem_alternative. Whatever, I find it more readable but this a personal opinion of course. We can drop it if you want. On the ALLOC/FREE column, + means an allocation and - a free. On the type column, you have K = kmalloc, C = cache, P = page I would like the flags to be GFP_* strings but that would not be easy to not break the column with strings.... About the node...it seems to always be -1. I don't know why but that shouldn't be difficult to find. I moved linux/tracepoint.h to trace/tracepoint.h as well. I think that would be more easy to find the tracer headers if they are all in their common directory. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/kmemtrace.h | 86 ------------ include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/trace/kmemtrace.h | 75 ++++++++++ init/main.c | 2 +- kernel/trace/Kconfig | 22 +++ kernel/trace/Makefile | 1 + kernel/trace/kmemtrace.c | 343 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 25 ++++ lib/Kconfig.debug | 20 --- mm/kmemtrace.c | 2 +- mm/slob.c | 2 +- mm/slub.c | 2 +- 13 files changed, 472 insertions(+), 112 deletions(-) delete mode 100644 include/linux/kmemtrace.h create mode 100644 include/trace/kmemtrace.h create mode 100644 kernel/trace/kmemtrace.c diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h deleted file mode 100644 index 5bea8ead6a6b..000000000000 --- a/include/linux/kmemtrace.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. - */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ - -#include -#include - -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; - -#ifdef CONFIG_KMEMTRACE - -extern void kmemtrace_init(void); - -static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d", - type_id, call_site, (unsigned long) ptr, - (unsigned long) bytes_req, (unsigned long) bytes_alloc, - (unsigned long) gfp_flags, node); -} - -static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - trace_mark(kmemtrace_free, "type_id %d call_site %lu ptr %lu", - type_id, call_site, (unsigned long) ptr); -} - -#else /* CONFIG_KMEMTRACE */ - -static inline void kmemtrace_init(void) -{ -} - -static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ -} - -static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ -} - -#endif /* CONFIG_KMEMTRACE */ - -static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_mark_alloc_node(type_id, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 7555ce99f6d2..455f9affea9a 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include +#include /* Size description struct for general caches. */ struct cache_sizes { diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index dc28432b5b9a..6b657f7dcb2b 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h new file mode 100644 index 000000000000..ad8b7857855a --- /dev/null +++ b/include/trace/kmemtrace.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include +#include + +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + +#ifdef CONFIG_KMEMTRACE + +extern void kmemtrace_init(void); + +extern void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node); + +extern void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr); + +#else /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_init(void) +{ +} + +static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ +} + +static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ +} + +#endif /* CONFIG_KMEMTRACE */ + +static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_mark_alloc_node(type_id, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/init/main.c b/init/main.c index 9711586aa7c9..beca7aaddb22 100644 --- a/init/main.c +++ b/init/main.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e2a4ff6fc3a6..27fb74b06b3c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -264,6 +264,28 @@ config HW_BRANCH_TRACER This tracer records all branches on the system in a circular buffer giving access to the last N branches for each cpu. +config KMEMTRACE + bool "Trace SLAB allocations" + select TRACING + depends on RELAY + help + kmemtrace provides tracing for slab allocator functions, such as + kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected + data is then fed to the userspace application in order to analyse + allocation hotspots, internal fragmentation and so on, making it + possible to see how well an allocator performs, as well as debug + and profile kernel code. + + This requires an userspace application to use. See + Documentation/vm/kmemtrace.txt for more information. + + Saying Y will make the kernel somewhat larger and slower. However, + if you disable kmemtrace at run-time or boot-time, the performance + impact is minimal (depending on the arch the kernel is built for). + + If unsure, say N. + + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 349d5a93653f..513dc86b5dfa 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -33,5 +33,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o +obj-$(CONFIG_KMEMTRACE) += kmemtrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c new file mode 100644 index 000000000000..d69cbe3c2a4b --- /dev/null +++ b/kernel/trace/kmemtrace.c @@ -0,0 +1,343 @@ +/* + * Memory allocator tracing + * + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * Copyright (C) 2008 Pekka Enberg + * Copyright (C) 2008 Frederic Weisbecker + */ + +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_KMEM_OPT_MINIMAL 0x1 + +static struct tracer_opt kmem_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, + { } +}; + +static struct tracer_flags kmem_tracer_flags = { + .val = 0, + .opts = kmem_opts +}; + + +static bool kmem_tracing_enabled __read_mostly; +static struct trace_array *kmemtrace_array; + +static int kmem_trace_init(struct trace_array *tr) +{ + int cpu; + kmemtrace_array = tr; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); + + kmem_tracing_enabled = true; + + return 0; +} + +static void kmem_trace_reset(struct trace_array *tr) +{ + kmem_tracing_enabled = false; +} + +static void kmemtrace_headers(struct seq_file *s) +{ + /* Don't need headers for the original kmemtrace output */ + if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) + return; + + seq_printf(s, "#\n"); + seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " + " POINTER NODE CALLER\n"); + seq_printf(s, "# FREE | | | | " + " | | | |\n"); + seq_printf(s, "# |\n\n"); +} + +/* + * The two following functions give the original output from kmemtrace, + * or something close to....perhaps they need some missing things + */ +static enum print_line_t +kmemtrace_print_alloc_original(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Taken from the old linux/kmemtrace.h */ + ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu " + "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", + entry->type_id, entry->call_site, (unsigned long) entry->ptr, + (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc, + (unsigned long) entry->gfp_flags, entry->node); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_original(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Taken from the old linux/kmemtrace.h */ + ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n", + entry->type_id, entry->call_site, (unsigned long) entry->ptr); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + +/* The two other following provide a more minimalistic output */ +static enum print_line_t +kmemtrace_print_alloc_compress(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Alloc entry */ + ret = trace_seq_printf(s, " + "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Type */ + switch (entry->type_id) { + case KMEMTRACE_TYPE_KMALLOC: + ret = trace_seq_printf(s, "K "); + break; + case KMEMTRACE_TYPE_CACHE: + ret = trace_seq_printf(s, "C "); + break; + case KMEMTRACE_TYPE_PAGES: + ret = trace_seq_printf(s, "P "); + break; + default: + ret = trace_seq_printf(s, "? "); + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Requested */ + ret = trace_seq_printf(s, "%4d ", entry->bytes_req); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Allocated */ + ret = trace_seq_printf(s, "%4d ", entry->bytes_alloc); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Flags + * TODO: would be better to see the name of the GFP flag names + */ + ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Pointer to allocated */ + ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Node */ + ret = trace_seq_printf(s, "%4d ", entry->node); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Call site */ + ret = seq_print_ip_sym(s, entry->call_site, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (!trace_seq_printf(s, "\n")) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +kmemtrace_print_free_compress(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) +{ + struct trace_seq *s = &iter->seq; + int ret; + + /* Free entry */ + ret = trace_seq_printf(s, " - "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Type */ + switch (entry->type_id) { + case KMEMTRACE_TYPE_KMALLOC: + ret = trace_seq_printf(s, "K "); + break; + case KMEMTRACE_TYPE_CACHE: + ret = trace_seq_printf(s, "C "); + break; + case KMEMTRACE_TYPE_PAGES: + ret = trace_seq_printf(s, "P "); + break; + default: + ret = trace_seq_printf(s, "? "); + } + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Skip requested/allocated/flags */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Pointer to allocated */ + ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Skip node */ + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Call site */ + ret = seq_print_ip_sym(s, entry->call_site, 0); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (!trace_seq_printf(s, "\n")) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + + switch (entry->type) { + case TRACE_KMEM_ALLOC: { + struct kmemtrace_alloc_entry *field; + trace_assign_type(field, entry); + if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) + return kmemtrace_print_alloc_compress(iter, field); + else + return kmemtrace_print_alloc_original(iter, field); + } + + case TRACE_KMEM_FREE: { + struct kmemtrace_free_entry *field; + trace_assign_type(field, entry); + if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) + return kmemtrace_print_free_compress(iter, field); + else + return kmemtrace_print_free_original(iter, field); + } + + default: + return TRACE_TYPE_UNHANDLED; + } +} + +/* Trace allocations */ +void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + struct ring_buffer_event *event; + struct kmemtrace_alloc_entry *entry; + struct trace_array *tr = kmemtrace_array; + unsigned long irq_flags; + + if (!kmem_tracing_enabled) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_ALLOC; + entry->call_site = call_site; + entry->ptr = ptr; + entry->bytes_req = bytes_req; + entry->bytes_alloc = bytes_alloc; + entry->gfp_flags = gfp_flags; + entry->node = node; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + +void kmemtrace_mark_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + struct ring_buffer_event *event; + struct kmemtrace_free_entry *entry; + struct trace_array *tr = kmemtrace_array; + unsigned long irq_flags; + + if (!kmem_tracing_enabled) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_FREE; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + +static struct tracer kmem_tracer __read_mostly = { + .name = "kmemtrace", + .init = kmem_trace_init, + .reset = kmem_trace_reset, + .print_line = kmemtrace_print_line, + .print_header = kmemtrace_headers, + .flags = &kmem_tracer_flags +}; + +static int __init init_kmem_tracer(void) +{ + return register_tracer(&kmem_tracer); +} + +device_initcall(init_kmem_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cc7a4f864036..534505bb39b0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,6 +9,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -29,6 +30,8 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, + TRACE_KMEM_ALLOC, + TRACE_KMEM_FREE, TRACE_POWER, __TRACE_LAST_TYPE @@ -170,6 +173,24 @@ struct trace_power { struct power_trace state_data; }; +struct kmemtrace_alloc_entry { + struct trace_entry ent; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; + size_t bytes_req; + size_t bytes_alloc; + gfp_t gfp_flags; + int node; +}; + +struct kmemtrace_free_entry { + struct trace_entry ent; + enum kmemtrace_type_id type_id; + unsigned long call_site; + const void *ptr; +}; + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -280,6 +301,10 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ + IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ + TRACE_KMEM_ALLOC); \ + IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ + TRACE_KMEM_FREE); \ __ftrace_bad_type(); \ } while (0) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b5417e23ba94..b0f239e443bc 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -803,26 +803,6 @@ config FIREWIRE_OHCI_REMOTE_DMA If unsure, say N. -config KMEMTRACE - bool "Kernel memory tracer (kmemtrace)" - depends on RELAY && DEBUG_FS && MARKERS - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/vm/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - menuconfig BUILD_DOCSRC bool "Build targets in Documentation/ tree" depends on HEADERS_CHECK diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c index 2a70a805027c..0573b5080cc4 100644 --- a/mm/kmemtrace.c +++ b/mm/kmemtrace.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #define KMEMTRACE_SUBBUF_SIZE 524288 #define KMEMTRACE_DEF_N_SUBBUFS 20 diff --git a/mm/slob.c b/mm/slob.c index 0f1a49f40690..4d1c0fc33b6b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include /* diff --git a/mm/slub.c b/mm/slub.c index cc4001fee7ac..7bf8cf8ec082 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From b6ab4afee4ed56d0f69df59485585cff828c327d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 30 Dec 2008 09:41:04 +0100 Subject: tracing, kvm: change MARKERS to select instead of depends on Impact: build fix fix: kernel/trace/Kconfig:42:error: found recursive dependency: TRACING -> TRACEPOINTS -> MARKERS -> KVM_TRACE -> RELAY -> KMEMTRACE -> TRACING markers is a facility that should be selected - not depended on by an interactive Kconfig entry. Signed-off-by: Ingo Molnar --- arch/x86/kvm/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b81125f0bdee..c7da3683f4c5 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -55,7 +55,8 @@ config KVM_AMD config KVM_TRACE bool "KVM trace support" - depends on KVM && MARKERS && SYSFS + depends on KVM && SYSFS + select MARKERS select RELAY select DEBUG_FS default n -- cgit v1.2.3-59-g8ed1b From 3fd4bc015ef879a7d2b955ce97fb125e3a51ba7e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 30 Dec 2008 12:07:27 +0100 Subject: tracing/kmemtrace: export kmemtrace_mark_alloc_node() / kmemtrace_mark_free() Impact: build fix Also fix up Kconfig dependencies and include files. Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 3 ++- kernel/trace/kmemtrace.c | 2 ++ mm/slab.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 27fb74b06b3c..cc9f91e7daf4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -267,7 +267,8 @@ config HW_BRANCH_TRACER config KMEMTRACE bool "Trace SLAB allocations" select TRACING - depends on RELAY + select MARKERS + select RELAY help kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index d69cbe3c2a4b..2bfdcd326226 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -296,6 +296,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, trace_wake_up(); } +EXPORT_SYMBOL(kmemtrace_mark_alloc_node); void kmemtrace_mark_free(enum kmemtrace_type_id type_id, unsigned long call_site, @@ -325,6 +326,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, trace_wake_up(); } +EXPORT_SYMBOL(kmemtrace_mark_free); static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", diff --git a/mm/slab.c b/mm/slab.c index bcf08ea88380..7f72bb386a09 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 195ec037ff8f6fa800616e0dad8d57a98b6fb37e Mon Sep 17 00:00:00 2001 From: Lachlan McIlroy Date: Wed, 31 Dec 2008 12:10:12 +1100 Subject: [XFS] Update maintainers New maintainer contact and new tree location. Reviewed-by: Bill O`Donnell Signed-off-by: Lachlan McIlroy --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 08d0ab7fa161..d65048a81ecd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4789,11 +4789,11 @@ S: Supported XFS FILESYSTEM P: Silicon Graphics Inc -P: Tim Shimmin +P: Bill O'Donnell M: xfs-masters@oss.sgi.com L: xfs@oss.sgi.com W: http://oss.sgi.com/projects/xfs -T: git git://oss.sgi.com:8090/xfs/xfs-2.6.git +T: git://oss.sgi.com/xfs/xfs.git S: Supported XILINX SYSTEMACE DRIVER -- cgit v1.2.3-59-g8ed1b From f09eac9034a4502cce558b0ec4bf7d422b8b355b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 31 Dec 2008 09:43:46 +0100 Subject: tracing/kmemtrace: fix typo Impact: build fix Signed-off-by: Ingo Molnar --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab.c b/mm/slab.c index 83075f36df7b..c65c52dc78d4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From d2859751cd0bf586941ffa7308635a293f943c17 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:40:44 +1100 Subject: [XFS] remove old vmap cache XFS's vmap batching simply defers a number (up to 64) of vunmaps, and keeps track of them in a list. To purge the batch, it just goes through the list and calls vunamp on each one. This is pretty poor: a global TLB flush is generally still performed on each vunmap, with the most expensive parts of the operation being the broadcast IPIs and locking involved in the SMP callouts, and the locking involved in the vmap management -- none of these are avoided by just batching up the calls. I'm actually surprised it ever made much difference. (Now that the lazy vmap allocator is upstream, this description is not quite right, but the vunmap batching still doesn't seem to do much) Rip all this logic out of XFS completely. I will improve vmap performance and scalability directly in subsequent patch. Signed-off-by: Nick Piggin Reviewed-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_buf.c | 75 +--------------------------------------------- 1 file changed, 1 insertion(+), 74 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index cb329edc925b..0b2177a9fbdc 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -165,75 +165,6 @@ test_page_region( return (mask && (page_private(page) & mask) == mask); } -/* - * Mapping of multi-page buffers into contiguous virtual space - */ - -typedef struct a_list { - void *vm_addr; - struct a_list *next; -} a_list_t; - -static a_list_t *as_free_head; -static int as_list_len; -static DEFINE_SPINLOCK(as_lock); - -/* - * Try to batch vunmaps because they are costly. - */ -STATIC void -free_address( - void *addr) -{ - a_list_t *aentry; - -#ifdef CONFIG_XEN - /* - * Xen needs to be able to make sure it can get an exclusive - * RO mapping of pages it wants to turn into a pagetable. If - * a newly allocated page is also still being vmap()ed by xfs, - * it will cause pagetable construction to fail. This is a - * quick workaround to always eagerly unmap pages so that Xen - * is happy. - */ - vunmap(addr); - return; -#endif - - aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); - if (likely(aentry)) { - spin_lock(&as_lock); - aentry->next = as_free_head; - aentry->vm_addr = addr; - as_free_head = aentry; - as_list_len++; - spin_unlock(&as_lock); - } else { - vunmap(addr); - } -} - -STATIC void -purge_addresses(void) -{ - a_list_t *aentry, *old; - - if (as_free_head == NULL) - return; - - spin_lock(&as_lock); - aentry = as_free_head; - as_free_head = NULL; - as_list_len = 0; - spin_unlock(&as_lock); - - while ((old = aentry) != NULL) { - vunmap(aentry->vm_addr); - aentry = aentry->next; - kfree(old); - } -} - /* * Internal xfs_buf_t object manipulation */ @@ -333,7 +264,7 @@ xfs_buf_free( uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - free_address(bp->b_addr - bp->b_offset); + vunmap(bp->b_addr - bp->b_offset); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -455,8 +386,6 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { - if (as_list_len > 64) - purge_addresses(); bp->b_addr = vmap(bp->b_pages, bp->b_page_count, VM_MAP, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) @@ -1743,8 +1672,6 @@ xfsbufd( count++; } - if (as_list_len > 0) - purge_addresses(); if (count) blk_run_address_space(target->bt_mapping); -- cgit v1.2.3-59-g8ed1b From 95f8e302c04c0b0c6de35ab399a5551605eeb006 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 6 Jan 2009 14:43:09 +1100 Subject: [XFS] use scalable vmap API Implement XFS's large buffer support with the new vmap APIs. See the vmap rewrite (db64fe02) for some numbers. The biggest improvement that comes from using the new APIs is avoiding the global KVA allocation lock on every call. Signed-off-by: Nick Piggin Reviewed-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/linux-2.6/xfs_buf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 0b2177a9fbdc..d71dc44e21ed 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -264,7 +264,7 @@ xfs_buf_free( uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - vunmap(bp->b_addr - bp->b_offset); + vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -386,8 +386,8 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { - bp->b_addr = vmap(bp->b_pages, bp->b_page_count, - VM_MAP, PAGE_KERNEL); + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, + -1, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) return -ENOMEM; bp->b_addr += bp->b_offset; -- cgit v1.2.3-59-g8ed1b From 723cbe0775514853c22dc45005af59c360916af1 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 5 Jan 2009 22:09:58 +0200 Subject: kmemtrace: Remove the relay version of kmemtrace Impact: cleanup kmemtrace now uses ftrace. This patch removes the relay version. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 2 - mm/Makefile | 1 - mm/kmemtrace.c | 333 --------------------------------------------------- 3 files changed, 336 deletions(-) delete mode 100644 mm/kmemtrace.c diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cc9f91e7daf4..1c0b7504cab3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -267,8 +267,6 @@ config HW_BRANCH_TRACER config KMEMTRACE bool "Trace SLAB allocations" select TRACING - select MARKERS - select RELAY help kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected diff --git a/mm/Makefile b/mm/Makefile index c92e8af13206..51c27709cc7c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -35,4 +35,3 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c deleted file mode 100644 index 0573b5080cc4..000000000000 --- a/mm/kmemtrace.c +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. - */ - -#include -#include -#include -#include -#include -#include -#include - -#define KMEMTRACE_SUBBUF_SIZE 524288 -#define KMEMTRACE_DEF_N_SUBBUFS 20 - -static struct rchan *kmemtrace_chan; -static u32 kmemtrace_buf_overruns; - -static unsigned int kmemtrace_n_subbufs; - -/* disabled by default */ -static unsigned int kmemtrace_enabled; - -/* - * The sequence number is used for reordering kmemtrace packets - * in userspace, since they are logged as per-CPU data. - * - * atomic_t should always be a 32-bit signed integer. Wraparound is not - * likely to occur, but userspace can deal with it by expecting a certain - * sequence number in the next packet that will be read. - */ -static atomic_t kmemtrace_seq_num; - -#define KMEMTRACE_ABI_VERSION 1 - -static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION; - -enum kmemtrace_event_id { - KMEMTRACE_EVENT_ALLOC = 0, - KMEMTRACE_EVENT_FREE, -}; - -struct kmemtrace_event { - u8 event_id; - u8 type_id; - u16 event_size; - s32 seq_num; - u64 call_site; - u64 ptr; -} __attribute__ ((__packed__)); - -struct kmemtrace_stats_alloc { - u64 bytes_req; - u64 bytes_alloc; - u32 gfp_flags; - s32 numa_node; -} __attribute__ ((__packed__)); - -static void kmemtrace_probe_alloc(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - unsigned long flags; - struct kmemtrace_event *ev; - struct kmemtrace_stats_alloc *stats; - void *buf; - - local_irq_save(flags); - - buf = relay_reserve(kmemtrace_chan, - sizeof(struct kmemtrace_event) + - sizeof(struct kmemtrace_stats_alloc)); - if (!buf) - goto failed; - - /* - * Don't convert this to use structure initializers, - * C99 does not guarantee the rvalues evaluation order. - */ - - ev = buf; - ev->event_id = KMEMTRACE_EVENT_ALLOC; - ev->type_id = va_arg(*args, int); - ev->event_size = sizeof(struct kmemtrace_event) + - sizeof(struct kmemtrace_stats_alloc); - ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); - ev->call_site = va_arg(*args, unsigned long); - ev->ptr = va_arg(*args, unsigned long); - - stats = buf + sizeof(struct kmemtrace_event); - stats->bytes_req = va_arg(*args, unsigned long); - stats->bytes_alloc = va_arg(*args, unsigned long); - stats->gfp_flags = va_arg(*args, unsigned long); - stats->numa_node = va_arg(*args, int); - -failed: - local_irq_restore(flags); -} - -static void kmemtrace_probe_free(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - unsigned long flags; - struct kmemtrace_event *ev; - - local_irq_save(flags); - - ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event)); - if (!ev) - goto failed; - - /* - * Don't convert this to use structure initializers, - * C99 does not guarantee the rvalues evaluation order. - */ - ev->event_id = KMEMTRACE_EVENT_FREE; - ev->type_id = va_arg(*args, int); - ev->event_size = sizeof(struct kmemtrace_event); - ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num); - ev->call_site = va_arg(*args, unsigned long); - ev->ptr = va_arg(*args, unsigned long); - -failed: - local_irq_restore(flags); -} - -static struct dentry * -kmemtrace_create_buf_file(const char *filename, struct dentry *parent, - int mode, struct rchan_buf *buf, int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static int kmemtrace_remove_buf_file(struct dentry *dentry) -{ - debugfs_remove(dentry); - - return 0; -} - -static int kmemtrace_subbuf_start(struct rchan_buf *buf, - void *subbuf, - void *prev_subbuf, - size_t prev_padding) -{ - if (relay_buf_full(buf)) { - /* - * We know it's not SMP-safe, but neither - * debugfs_create_u32() is. - */ - kmemtrace_buf_overruns++; - return 0; - } - - return 1; -} - -static struct rchan_callbacks relay_callbacks = { - .create_buf_file = kmemtrace_create_buf_file, - .remove_buf_file = kmemtrace_remove_buf_file, - .subbuf_start = kmemtrace_subbuf_start, -}; - -static struct dentry *kmemtrace_dir; -static struct dentry *kmemtrace_overruns_dentry; -static struct dentry *kmemtrace_abi_version_dentry; - -static struct dentry *kmemtrace_enabled_dentry; - -static int kmemtrace_start_probes(void) -{ - int err; - - err = marker_probe_register("kmemtrace_alloc", "type_id %d " - "call_site %lu ptr %lu " - "bytes_req %lu bytes_alloc %lu " - "gfp_flags %lu node %d", - kmemtrace_probe_alloc, NULL); - if (err) - return err; - err = marker_probe_register("kmemtrace_free", "type_id %d " - "call_site %lu ptr %lu", - kmemtrace_probe_free, NULL); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - marker_probe_unregister("kmemtrace_alloc", - kmemtrace_probe_alloc, NULL); - marker_probe_unregister("kmemtrace_free", - kmemtrace_probe_free, NULL); -} - -static int kmemtrace_enabled_get(void *data, u64 *val) -{ - *val = *((int *) data); - - return 0; -} - -static int kmemtrace_enabled_set(void *data, u64 val) -{ - u64 old_val = kmemtrace_enabled; - - *((int *) data) = !!val; - - if (old_val == val) - return 0; - if (val) - kmemtrace_start_probes(); - else - kmemtrace_stop_probes(); - - return 0; -} - -DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops, - kmemtrace_enabled_get, - kmemtrace_enabled_set, "%llu\n"); - -static void kmemtrace_cleanup(void) -{ - if (kmemtrace_enabled_dentry) - debugfs_remove(kmemtrace_enabled_dentry); - - kmemtrace_stop_probes(); - - if (kmemtrace_abi_version_dentry) - debugfs_remove(kmemtrace_abi_version_dentry); - if (kmemtrace_overruns_dentry) - debugfs_remove(kmemtrace_overruns_dentry); - - relay_close(kmemtrace_chan); - kmemtrace_chan = NULL; - - if (kmemtrace_dir) - debugfs_remove(kmemtrace_dir); -} - -static int __init kmemtrace_setup_late(void) -{ - if (!kmemtrace_chan) - goto failed; - - kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL); - if (!kmemtrace_dir) - goto cleanup; - - kmemtrace_abi_version_dentry = - debugfs_create_u32("abi_version", S_IRUSR, - kmemtrace_dir, &kmemtrace_abi_version); - kmemtrace_overruns_dentry = - debugfs_create_u32("total_overruns", S_IRUSR, - kmemtrace_dir, &kmemtrace_buf_overruns); - if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry) - goto cleanup; - - kmemtrace_enabled_dentry = - debugfs_create_file("enabled", S_IRUSR | S_IWUSR, - kmemtrace_dir, &kmemtrace_enabled, - &kmemtrace_enabled_fops); - if (!kmemtrace_enabled_dentry) - goto cleanup; - - if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir)) - goto cleanup; - - printk(KERN_INFO "kmemtrace: fully up.\n"); - - return 0; - -cleanup: - kmemtrace_cleanup(); -failed: - return 1; -} -late_initcall(kmemtrace_setup_late); - -static int __init kmemtrace_set_boot_enabled(char *str) -{ - if (!str) - return -EINVAL; - - if (!strcmp(str, "yes")) - kmemtrace_enabled = 1; - else if (!strcmp(str, "no")) - kmemtrace_enabled = 0; - else - return -EINVAL; - - return 0; -} -early_param("kmemtrace.enable", kmemtrace_set_boot_enabled); - -static int __init kmemtrace_set_subbufs(char *str) -{ - get_option(&str, &kmemtrace_n_subbufs); - return 0; -} -early_param("kmemtrace.subbufs", kmemtrace_set_subbufs); - -void kmemtrace_init(void) -{ - if (!kmemtrace_n_subbufs) - kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS; - - kmemtrace_chan = relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE, - kmemtrace_n_subbufs, &relay_callbacks, - NULL); - if (!kmemtrace_chan) { - printk(KERN_ERR "kmemtrace: could not open relay channel.\n"); - return; - } - - if (!kmemtrace_enabled) { - printk(KERN_INFO "kmemtrace: disabled. Pass " - "kemtrace.enable=yes as kernel parameter for " - "boot-time tracing.\n"); - return; - } - if (kmemtrace_start_probes()) { - printk(KERN_ERR "kmemtrace: could not register marker probes!\n"); - kmemtrace_cleanup(); - return; - } - - printk(KERN_INFO "kmemtrace: enabled.\n"); -} - -- cgit v1.2.3-59-g8ed1b From 3e80680208ba6ce9635ca7c21ad0019442ea166a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 6 Jan 2009 10:16:35 +0100 Subject: kmemtrace: add kmemtrace_init() Impact: build fix leftover from the relayfs version - but we want to keep it because this call is the earliest opportunity when we can start kmemtrace tracing. (after kmem_cache_init()). Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 2bfdcd326226..faaa5ae7e75a 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -337,6 +337,11 @@ static struct tracer kmem_tracer __read_mostly = { .flags = &kmem_tracer_flags }; +void kmemtrace_init(void) +{ + /* earliest opportunity to start kmem tracing */ +} + static int __init init_kmem_tracer(void) { return register_tracer(&kmem_tracer); -- cgit v1.2.3-59-g8ed1b From 431aa3fbf5bbe3be79809c7e603c2ed2ac64b015 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 Jan 2009 12:43:01 -0500 Subject: ftrace: convert unsigned index to signed Impact: fix to unsigned compared to less than zero Roel Kluin pointed out that there is a compare of an unsigned number to less than zero. A previous clean up had the unsigned index set to -1 for certain cases, but never converted it to signed. Frederic Weisbecker noticed that another index is used to compare the above index to and it also needs to be converted to signed. [ Converted ftrace_page->index to int from unsigned long as Andrew Morton pointed out that there's no need for it to stay a long. ] Reported-by: Roel Kluin Reported-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2f32969c09df..9e54a6ccdb93 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -289,7 +289,7 @@ static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; - unsigned long index; + int index; struct dyn_ftrace records[]; }; @@ -786,7 +786,7 @@ enum { struct ftrace_iterator { struct ftrace_page *pg; - unsigned idx; + int idx; unsigned flags; unsigned char buffer[FTRACE_BUFF_MAX+1]; unsigned buffer_idx; -- cgit v1.2.3-59-g8ed1b From ff288b274a9b383046fdbda4be3067daba4d5fe8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 6 Jan 2009 21:33:30 +0100 Subject: tracing/ftrace: fix a memory leak in stat tracing Impact: fix memory leak This patch fixes a memory leak inside reset_stat_list(). The freeing loop iterated only once. Also turn the stat_list into a simple struct list_head, which simplify the code and avoid an unused static pointer. Reported-by: Steven Rostedt Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 6f194a33a64a..4cb4ff27646d 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -21,7 +21,7 @@ struct trace_stat_list { void *stat; }; -static struct trace_stat_list stat_list; +static LIST_HEAD(stat_list); /* * This is a copy of the current tracer to avoid racy @@ -39,22 +39,12 @@ static DEFINE_MUTEX(stat_list_mutex); static void reset_stat_list(void) { - struct trace_stat_list *node; - struct list_head *next; + struct trace_stat_list *node, *next; - if (list_empty(&stat_list.list)) - return; - - node = list_entry(stat_list.list.next, struct trace_stat_list, list); - next = node->list.next; - - while (&node->list != next) { + list_for_each_entry_safe(node, next, &stat_list, list) kfree(node); - node = list_entry(next, struct trace_stat_list, list); - } - kfree(node); - INIT_LIST_HEAD(&stat_list.list); + INIT_LIST_HEAD(&stat_list); } void init_tracer_stat(struct tracer *trace) @@ -107,7 +97,7 @@ static int stat_seq_init(void) } INIT_LIST_HEAD(&new_entry->list); - list_add(&new_entry->list, &stat_list.list); + list_add(&new_entry->list, &stat_list); new_entry->stat = current_tracer.stat_start(); prev_stat = new_entry->stat; @@ -130,7 +120,7 @@ static int stat_seq_init(void) if (!new_entry->stat) break; - list_for_each_entry(iter_entry, &stat_list.list, list) { + list_for_each_entry(iter_entry, &stat_list, list) { /* Insertion with a descendent sorting */ if (current_tracer.stat_cmp(new_entry->stat, iter_entry->stat) > 0) { @@ -141,7 +131,7 @@ static int stat_seq_init(void) /* The current smaller value */ } else if (list_is_last(&iter_entry->list, - &stat_list.list)) { + &stat_list)) { list_add(&new_entry->list, &iter_entry->list); break; } @@ -162,7 +152,7 @@ exit_free_list: static void *stat_seq_start(struct seq_file *s, loff_t *pos) { - struct trace_stat_list *l = (struct trace_stat_list *)s->private; + struct list_head *l = (struct list_head *)s->private; /* Prevent from tracer switch or stat_list modification */ mutex_lock(&stat_list_mutex); @@ -171,14 +161,14 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) if (!*pos && current_tracer.stat_headers) current_tracer.stat_headers(s); - return seq_list_start(&l->list, *pos); + return seq_list_start(l, *pos); } static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { - struct trace_stat_list *l = (struct trace_stat_list *)s->private; + struct list_head *l = (struct list_head *)s->private; - return seq_list_next(p, &l->list, pos); + return seq_list_next(p, l, pos); } static void stat_seq_stop(struct seq_file *m, void *p) @@ -188,8 +178,10 @@ static void stat_seq_stop(struct seq_file *m, void *p) static int stat_seq_show(struct seq_file *s, void *v) { - struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); - return current_tracer.stat_show(s, l->stat); + struct trace_stat_list *entry = + list_entry(v, struct trace_stat_list, list); + + return current_tracer.stat_show(s, entry->stat); } static const struct seq_operations trace_stat_seq_ops = { @@ -237,7 +229,6 @@ static int __init tracing_stat_init(void) struct dentry *d_tracing; struct dentry *entry; - INIT_LIST_HEAD(&stat_list.list); d_tracing = tracing_init_dentry(); entry = debugfs_create_file("trace_stat", 0444, d_tracing, -- cgit v1.2.3-59-g8ed1b From e8a9cbf6ae620d9e5ba9cb42001c033287a284a3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 6 Jan 2009 22:02:35 -0500 Subject: trace: clean up funny line breaks in stat_seq_show Impact: clean up Andrew Morton pointed out that the entry assignment in stat_seq_show did not need to be done in the declaration, causing funny line breaks. This patch makes it a bit more pleasing on the eyes. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 4cb4ff27646d..f110ce9ce7fb 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -178,8 +178,9 @@ static void stat_seq_stop(struct seq_file *m, void *p) static int stat_seq_show(struct seq_file *s, void *v) { - struct trace_stat_list *entry = - list_entry(v, struct trace_stat_list, list); + struct trace_stat_list *entry; + + entry = list_entry(v, struct trace_stat_list, list); return current_tracer.stat_show(s, entry->stat); } -- cgit v1.2.3-59-g8ed1b From 34a148bf0911a4a1cae85f8ecba57affb4d76aee Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Jan 2009 12:27:09 -0800 Subject: kernel/trace/ring_buffer.c: reduce inlining text data bss dec hex filename before: 11320 228 8 11556 2d24 kernel/trace/ring_buffer.o after: 10592 228 8 10828 2a4c kernel/trace/ring_buffer.o Also: free_page(0) is legal. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8b0daf0662ef..9542990f515b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -133,7 +133,7 @@ enum { }; /* inline for ring buffer fast paths */ -static inline unsigned +static unsigned rb_event_length(struct ring_buffer_event *event) { unsigned length; @@ -179,7 +179,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event) EXPORT_SYMBOL_GPL(ring_buffer_event_length); /* inline for ring buffer fast paths */ -static inline void * +static void * rb_event_data(struct ring_buffer_event *event) { BUG_ON(event->type != RINGBUF_TYPE_DATA); @@ -229,10 +229,9 @@ static void rb_init_page(struct buffer_data_page *bpage) * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. */ -static inline void free_buffer_page(struct buffer_page *bpage) +static void free_buffer_page(struct buffer_page *bpage) { - if (bpage->page) - free_page((unsigned long)bpage->page); + free_page((unsigned long)bpage->page); kfree(bpage); } @@ -811,7 +810,7 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); } -static inline int +static int rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -825,7 +824,7 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_commit_index(cpu_buffer) == index; } -static inline void +static void rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -850,7 +849,7 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, local_set(&cpu_buffer->commit_page->page->commit, index); } -static inline void +static void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { /* @@ -896,7 +895,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read = 0; } -static inline void rb_inc_iter(struct ring_buffer_iter *iter) +static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -926,7 +925,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter) * and with this, we can determine what to place into the * data field. */ -static inline void +static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { @@ -964,7 +963,7 @@ rb_update_event(struct ring_buffer_event *event, } } -static inline unsigned rb_calculate_event_length(unsigned length) +static unsigned rb_calculate_event_length(unsigned length) { struct ring_buffer_event event; /* Used only for sizeof array */ @@ -1438,7 +1437,7 @@ int ring_buffer_write(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_write); -static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = cpu_buffer->reader_page; struct buffer_page *head = cpu_buffer->head_page; -- cgit v1.2.3-59-g8ed1b From 67d347245f76a149c45bffb1a10145d31d61d1da Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 9 Jan 2009 12:27:09 -0800 Subject: kernel/trace/ring_buffer.c: use DIV_ROUND_UP Instead of open-coding it. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9542990f515b..4832ffa5d937 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -123,8 +123,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); #define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) -#define RB_ALIGNMENT_SHIFT 2 -#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) +#define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA 28 enum { @@ -151,7 +150,7 @@ rb_event_length(struct ring_buffer_event *event) case RINGBUF_TYPE_DATA: if (event->len) - length = event->len << RB_ALIGNMENT_SHIFT; + length = event->len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; @@ -937,15 +936,11 @@ rb_update_event(struct ring_buffer_event *event, break; case RINGBUF_TYPE_TIME_EXTEND: - event->len = - (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; + event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); break; case RINGBUF_TYPE_TIME_STAMP: - event->len = - (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; + event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); break; case RINGBUF_TYPE_DATA: @@ -954,9 +949,7 @@ rb_update_event(struct ring_buffer_event *event, event->len = 0; event->array[0] = length; } else - event->len = - (length + (RB_ALIGNMENT-1)) - >> RB_ALIGNMENT_SHIFT; + event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); -- cgit v1.2.3-59-g8ed1b From 034939b65ad5ff64b9709210b3469a95153c51a3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 8 Jan 2009 10:03:56 -0800 Subject: tracing/ftrace: handle more than one stat file per tracer Impact: new API for tracers Make the stat tracing API reentrant. And also provide the new directory /debugfs/tracing/trace_stat which will contain all the stat files for the current active tracer. Now a tracer will, if desired, want to provide a zero terminated array of tracer_stat structures. Each one contains the callbacks necessary for one stat file. It have to provide at least a name for its stat file, an iterator with stat_start/start_next callback and an output callback for one stat entry. Also adapt the branch tracer to this new API. We create two files "all" and "annotated" inside the /debugfs/tracing/trace_stat directory, making the both stats simultaneously available instead of needing to change an option to switch from one stat file to another. The output of these stats haven't changed. Changes in v2: _ Apply the previous memory leak fix (rebase against tip/master) Changes in v3: _ Merge the patch that adapted the branch tracer to this Api in this patch to not break the kernel build. Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 35 ++++--- kernel/trace/trace_branch.c | 69 ++++++------- kernel/trace/trace_stat.c | 230 ++++++++++++++++++++++++++++++++------------ 3 files changed, 217 insertions(+), 117 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 94ed45e93a80..b3f9ad1b4d84 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -334,6 +334,25 @@ struct tracer_flags { /* Makes more easy to define a tracer opt */ #define TRACER_OPT(s, b) .name = #s, .bit = b +/* + * If you want to provide a stat file (one-shot statistics), fill + * an iterator with stat_start/stat_next and a stat_show callbacks. + * The others callbacks are optional. + */ +struct tracer_stat { + /* The name of your stat file */ + const char *name; + /* Iteration over statistic entries */ + void *(*stat_start)(void); + void *(*stat_next)(void *prev, int idx); + /* Compare two entries for sorting (optional) for stats */ + int (*stat_cmp)(void *p1, void *p2); + /* Print a stat entry */ + int (*stat_show)(struct seq_file *s, void *p); + /* Print the headers of your stat entries */ + int (*stat_headers)(struct seq_file *s); +}; + /* * A specific tracer, represented by methods that operate on a trace array: */ @@ -361,21 +380,7 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; - - /* - * If you change one of the following on tracing runtime, recall - * init_tracer_stat() - */ - - /* Iteration over statistic entries */ - void *(*stat_start)(void); - void *(*stat_next)(void *prev, int idx); - /* Compare two entries for sorting (optional) for stats */ - int (*stat_cmp)(void *p1, void *p2); - /* Print a stat entry */ - int (*stat_show)(struct seq_file *s, void *p); - /* Print the headers of your stat entries */ - int (*stat_headers)(struct seq_file *s); + struct tracer_stat *stats; }; struct trace_seq { diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4785a3b9bc4a..da5cf3e5581b 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -306,19 +306,6 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) } #ifdef CONFIG_PROFILE_ALL_BRANCHES -enum { - TRACE_BRANCH_OPT_ALL = 0x1 -}; - -static struct tracer_opt branch_opts[] = { - { TRACER_OPT(stat_all_branch, TRACE_BRANCH_OPT_ALL) }, - { } -}; - -static struct tracer_flags branch_flags = { - .val = 0, - .opts = branch_opts -}; extern unsigned long __start_branch_profile[]; extern unsigned long __stop_branch_profile[]; @@ -352,28 +339,36 @@ all_branch_stat_next(void *v, int idx) return p; } -static int branch_set_flag(u32 old_flags, u32 bit, int set) -{ - if (bit == TRACE_BRANCH_OPT_ALL) { - if (set) { - branch_trace.stat_headers = all_branch_stat_headers; - branch_trace.stat_start = all_branch_stat_start; - branch_trace.stat_next = all_branch_stat_next; - branch_trace.stat_cmp = NULL; - } else { - branch_trace.stat_headers = - annotated_branch_stat_headers; - branch_trace.stat_start = annotated_branch_stat_start; - branch_trace.stat_next = annotated_branch_stat_next; - branch_trace.stat_cmp = annotated_branch_stat_cmp; - } - init_tracer_stat(&branch_trace); - } - return 0; -} +static struct tracer_stat branch_stats[] = { + {.name = "annotated", + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_cmp = annotated_branch_stat_cmp, + .stat_headers = annotated_branch_stat_headers, + .stat_show = branch_stat_show}, + {.name = "all", + .stat_start = all_branch_stat_start, + .stat_next = all_branch_stat_next, + .stat_headers = all_branch_stat_headers, + .stat_show = branch_stat_show}, + + { } +}; +#else +static struct tracer_stat branch_stats[] = { + {.name = "annotated", + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_cmp = annotated_branch_stat_cmp, + .stat_headers = annotated_branch_stat_headers, + .stat_show = branch_stat_show}, + + { } +}; #endif /* CONFIG_PROFILE_ALL_BRANCHES */ + static struct tracer branch_trace __read_mostly = { .name = "branch", @@ -383,16 +378,8 @@ static struct tracer branch_trace __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_branch, #endif /* CONFIG_FTRACE_SELFTEST */ -#endif /* CONFIG_BRANCH_TRACER */ - .stat_start = annotated_branch_stat_start, - .stat_next = annotated_branch_stat_next, - .stat_show = branch_stat_show, - .stat_headers = annotated_branch_stat_headers, - .stat_cmp = annotated_branch_stat_cmp, -#ifdef CONFIG_PROFILE_ALL_BRANCHES - .flags = &branch_flags, - .set_flag = branch_set_flag, #endif + .stats = branch_stats }; __init static int init_branch_trace(void) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index f110ce9ce7fb..1515f9e7adfc 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -21,37 +21,87 @@ struct trace_stat_list { void *stat; }; -static LIST_HEAD(stat_list); - -/* - * This is a copy of the current tracer to avoid racy - * and dangerous output while the current tracer is - * switched. - */ -static struct tracer current_tracer; +/* A stat session is the stats output in one file */ +struct tracer_stat_session { + struct tracer_stat *ts; + struct list_head stat_list; + struct mutex stat_mutex; +}; -/* - * Protect both the current tracer and the global - * stat list. - */ -static DEFINE_MUTEX(stat_list_mutex); +/* All of the sessions currently in use. Each stat file embeed one session */ +static struct tracer_stat_session **all_stat_sessions; +static int nb_sessions; +static struct dentry *stat_dir, **stat_files; -static void reset_stat_list(void) +static void reset_stat_session(struct tracer_stat_session *session) { struct trace_stat_list *node, *next; - list_for_each_entry_safe(node, next, &stat_list, list) + list_for_each_entry_safe(node, next, &session->stat_list, list) kfree(node); - INIT_LIST_HEAD(&stat_list); + INIT_LIST_HEAD(&session->stat_list); } -void init_tracer_stat(struct tracer *trace) +/* Called when a tracer is initialized */ +static int init_all_sessions(int nb, struct tracer_stat *ts) { - mutex_lock(&stat_list_mutex); - current_tracer = *trace; - mutex_unlock(&stat_list_mutex); + int i, j; + struct tracer_stat_session *session; + + nb_sessions = 0; + + if (all_stat_sessions) { + for (i = 0; i < nb_sessions; i++) { + session = all_stat_sessions[i]; + reset_stat_session(session); + mutex_destroy(&session->stat_mutex); + kfree(session); + } + } + all_stat_sessions = kmalloc(sizeof(struct tracer_stat_session *) * nb, + GFP_KERNEL); + if (!all_stat_sessions) + return -ENOMEM; + + for (i = 0; i < nb; i++) { + session = kmalloc(sizeof(struct tracer_stat_session) * nb, + GFP_KERNEL); + if (!session) + goto free_sessions; + + INIT_LIST_HEAD(&session->stat_list); + mutex_init(&session->stat_mutex); + session->ts = &ts[i]; + all_stat_sessions[i] = session; + } + nb_sessions = nb; + return 0; + +free_sessions: + + for (j = 0; j < i; j++) + kfree(all_stat_sessions[i]); + + kfree(all_stat_sessions); + all_stat_sessions = NULL; + + return -ENOMEM; +} + +static int basic_tracer_stat_checks(struct tracer_stat *ts) +{ + int i; + + if (!ts) + return 0; + + for (i = 0; ts[i].name; i++) { + if (!ts[i].stat_start || !ts[i].stat_next || !ts[i].stat_show) + return -EBUSY; + } + return i; } /* @@ -69,22 +119,19 @@ static int dummy_cmp(void *p1, void *p2) * All of these copies and sorting are required on all opening * since the stats could have changed between two file sessions. */ -static int stat_seq_init(void) +static int stat_seq_init(struct tracer_stat_session *session) { struct trace_stat_list *iter_entry, *new_entry; + struct tracer_stat *ts = session->ts; void *prev_stat; int ret = 0; int i; - mutex_lock(&stat_list_mutex); - reset_stat_list(); - - if (!current_tracer.stat_start || !current_tracer.stat_next || - !current_tracer.stat_show) - goto exit; + mutex_lock(&session->stat_mutex); + reset_stat_session(session); - if (!current_tracer.stat_cmp) - current_tracer.stat_cmp = dummy_cmp; + if (!ts->stat_cmp) + ts->stat_cmp = dummy_cmp; /* * The first entry. Actually this is the second, but the first @@ -97,9 +144,10 @@ static int stat_seq_init(void) } INIT_LIST_HEAD(&new_entry->list); - list_add(&new_entry->list, &stat_list); - new_entry->stat = current_tracer.stat_start(); + list_add(&new_entry->list, &session->stat_list); + + new_entry->stat = ts->stat_start(); prev_stat = new_entry->stat; /* @@ -114,15 +162,16 @@ static int stat_seq_init(void) } INIT_LIST_HEAD(&new_entry->list); - new_entry->stat = current_tracer.stat_next(prev_stat, i); + new_entry->stat = ts->stat_next(prev_stat, i); /* End of insertion */ if (!new_entry->stat) break; - list_for_each_entry(iter_entry, &stat_list, list) { + list_for_each_entry(iter_entry, &session->stat_list, list) { + /* Insertion with a descendent sorting */ - if (current_tracer.stat_cmp(new_entry->stat, + if (ts->stat_cmp(new_entry->stat, iter_entry->stat) > 0) { list_add_tail(&new_entry->list, @@ -131,7 +180,7 @@ static int stat_seq_init(void) /* The current smaller value */ } else if (list_is_last(&iter_entry->list, - &stat_list)) { + &session->stat_list)) { list_add(&new_entry->list, &iter_entry->list); break; } @@ -140,49 +189,49 @@ static int stat_seq_init(void) prev_stat = new_entry->stat; } exit: - mutex_unlock(&stat_list_mutex); + mutex_unlock(&session->stat_mutex); return ret; exit_free_list: - reset_stat_list(); - mutex_unlock(&stat_list_mutex); + reset_stat_session(session); + mutex_unlock(&session->stat_mutex); return ret; } static void *stat_seq_start(struct seq_file *s, loff_t *pos) { - struct list_head *l = (struct list_head *)s->private; + struct tracer_stat_session *session = s->private; /* Prevent from tracer switch or stat_list modification */ - mutex_lock(&stat_list_mutex); + mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && current_tracer.stat_headers) - current_tracer.stat_headers(s); + if (!*pos && session->ts->stat_headers) + session->ts->stat_headers(s); - return seq_list_start(l, *pos); + return seq_list_start(&session->stat_list, *pos); } static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { - struct list_head *l = (struct list_head *)s->private; + struct tracer_stat_session *session = s->private; - return seq_list_next(p, l, pos); + return seq_list_next(p, &session->stat_list, pos); } -static void stat_seq_stop(struct seq_file *m, void *p) +static void stat_seq_stop(struct seq_file *s, void *p) { - mutex_unlock(&stat_list_mutex); + struct tracer_stat_session *session = s->private; + mutex_unlock(&session->stat_mutex); } static int stat_seq_show(struct seq_file *s, void *v) { - struct trace_stat_list *entry; - - entry = list_entry(v, struct trace_stat_list, list); + struct tracer_stat_session *session = s->private; + struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); - return current_tracer.stat_show(s, entry->stat); + return session->ts->stat_show(s, l->stat); } static const struct seq_operations trace_stat_seq_ops = { @@ -192,15 +241,18 @@ static const struct seq_operations trace_stat_seq_ops = { .show = stat_seq_show }; +/* The session stat is refilled and resorted at each stat file opening */ static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; + struct tracer_stat_session *session = inode->i_private; + ret = seq_open(file, &trace_stat_seq_ops); if (!ret) { struct seq_file *m = file->private_data; - m->private = &stat_list; - ret = stat_seq_init(); + m->private = session; + ret = stat_seq_init(session); } return ret; @@ -212,9 +264,12 @@ static int tracing_stat_open(struct inode *inode, struct file *file) */ static int tracing_stat_release(struct inode *i, struct file *f) { - mutex_lock(&stat_list_mutex); - reset_stat_list(); - mutex_unlock(&stat_list_mutex); + struct tracer_stat_session *session = i->i_private; + + mutex_lock(&session->stat_mutex); + reset_stat_session(session); + mutex_unlock(&session->stat_mutex); + return 0; } @@ -225,17 +280,70 @@ static const struct file_operations tracing_stat_fops = { .release = tracing_stat_release }; + +static void destroy_trace_stat_files(void) +{ + int i; + + if (stat_files) { + for (i = 0; i < nb_sessions; i++) + debugfs_remove(stat_files[i]); + kfree(stat_files); + stat_files = NULL; + } +} + +static void init_trace_stat_files(void) +{ + int i; + + if (!stat_dir || !nb_sessions) + return; + + stat_files = kmalloc(sizeof(struct dentry *) * nb_sessions, GFP_KERNEL); + + if (!stat_files) { + pr_warning("trace stat: not enough memory\n"); + return; + } + + for (i = 0; i < nb_sessions; i++) { + struct tracer_stat_session *session = all_stat_sessions[i]; + stat_files[i] = debugfs_create_file(session->ts->name, 0644, + stat_dir, + session, &tracing_stat_fops); + if (!stat_files[i]) + pr_warning("cannot create %s entry\n", + session->ts->name); + } +} + +void init_tracer_stat(struct tracer *trace) +{ + int nb = basic_tracer_stat_checks(trace->stats); + + destroy_trace_stat_files(); + + if (nb < 0) { + pr_warning("stat tracing: missing stat callback on %s\n", + trace->name); + return; + } + if (!nb) + return; + + init_all_sessions(nb, trace->stats); + init_trace_stat_files(); +} + static int __init tracing_stat_init(void) { struct dentry *d_tracing; - struct dentry *entry; d_tracing = tracing_init_dentry(); - entry = debugfs_create_file("trace_stat", 0444, d_tracing, - NULL, - &tracing_stat_fops); - if (!entry) + stat_dir = debugfs_create_dir("trace_stat", d_tracing); + if (!stat_dir) pr_warning("Could not create debugfs " "'trace_stat' entry\n"); return 0; -- cgit v1.2.3-59-g8ed1b From 5d2ad3316e29ad218f98d66b9c0ce6d4bcd05b77 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sat, 3 Jan 2009 21:09:27 +0200 Subject: doc: mmiotrace.txt, buffer size control change Impact: prevents confusing the user when buffer size is inadequate The tracing framework offers a resizeable buffer, which mmiotrace uses to record events. If the buffer is full, the following events will be lost. Events should not be lost, so the documentation instructs the user to increase the buffer size. The buffer size is set via a debugfs file. Mmiotrace documentation was not updated the same time the debugfs file was changed. The old file was tracing/trace_entries and first contained the number of entries the buffer had space for, per cpu. Nowadays this file is replaced with the file tracing/buffer_size_kb, which tells the amount of memory reserved for the buffer, per cpu, in kilobytes. Previously, a flag had to be toggled via the debugfs file tracing/tracing_enabled when the buffer size was changed. This is no longer necessary. The mmiotrace documentation is updated to reflect the current state of the tracing framework. Signed-off-by: Pekka Paalanen Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/tracers/mmiotrace.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt index cde23b4a12a1..5731c67abc55 100644 --- a/Documentation/tracers/mmiotrace.txt +++ b/Documentation/tracers/mmiotrace.txt @@ -78,12 +78,10 @@ to view your kernel log and look for "mmiotrace has lost events" warning. If events were lost, the trace is incomplete. You should enlarge the buffers and try again. Buffers are enlarged by first seeing how large the current buffers are: -$ cat /debug/tracing/trace_entries +$ cat /debug/tracing/buffer_size_kb gives you a number. Approximately double this number and write it back, for instance: -$ echo 0 > /debug/tracing/tracing_enabled -$ echo 128000 > /debug/tracing/trace_entries -$ echo 1 > /debug/tracing/tracing_enabled +$ echo 128000 > /debug/tracing/buffer_size_kb Then start again from the top. If you are doing a trace for a driver project, e.g. Nouveau, you should also -- cgit v1.2.3-59-g8ed1b From fe6f90e57fd31af8daca534ea01db2e5666c15da Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sat, 3 Jan 2009 21:23:51 +0200 Subject: trace: mmiotrace to the tracer menu in Kconfig Impact: cosmetic change in Kconfig menu layout This patch was originally suggested by Peter Zijlstra, but seems it was forgotten. CONFIG_MMIOTRACE and CONFIG_MMIOTRACE_TEST were selectable directly under the Kernel hacking / debugging menu in the kernel configuration system. They were present only for x86 and x86_64. Other tracers that use the ftrace tracing framework are in their own sub-menu. This patch moves the mmiotrace configuration options there. Since the Kconfig file, where the tracer menu is, is not architecture specific, HAVE_MMIOTRACE_SUPPORT is introduced and provided only by x86/x86_64. CONFIG_MMIOTRACE now depends on it. Signed-off-by: Pekka Paalanen Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 24 ++---------------------- kernel/trace/Kconfig | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 10d6cc3fd052..e1983fa025d2 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -174,28 +174,8 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config MMIOTRACE - bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && PCI - select TRACING - help - Mmiotrace traces Memory Mapped I/O access and is meant for - debugging and reverse engineering. It is called from the ioremap - implementation and works via page faults. Tracing is disabled by - default and can be enabled at run-time. - - See Documentation/tracers/mmiotrace.txt. - If you are not helping to develop drivers, say N. - -config MMIOTRACE_TEST - tristate "Test module for mmiotrace" - depends on MMIOTRACE && m - help - This is a dumb module for testing mmiotrace. It is very dangerous - as it will write garbage to IO memory starting at a given address. - However, it should be safe to use on e.g. unused portion of VRAM. - - Say N, unless you absolutely know what you are doing. +config HAVE_MMIOTRACE_SUPPORT + def_bool y # # IO delay types: diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1c0b7504cab3..944239296f13 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -323,4 +323,27 @@ config FTRACE_STARTUP_TEST functioning properly. It will do tests on all the configured tracers of ftrace. +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI + select TRACING + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/tracers/mmiotrace.txt. + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + endmenu -- cgit v1.2.3-59-g8ed1b From 173ed24ee2d64f5de28654eb456ec1ee18a142e5 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 6 Jan 2009 13:57:11 +0200 Subject: mmiotrace: count events lost due to not recording Impact: enhances lost events counting in mmiotrace The tracing framework, or the ring buffer facility it uses, has a switch to stop recording data. When recording is off, the trace events will be lost. The framework does not count these, so mmiotrace has to count them itself. Signed-off-by: Pekka Paalanen Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fcec59ff2355..621c8c3f3139 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" @@ -20,6 +21,7 @@ struct header_iter { static struct trace_array *mmio_trace_array; static bool overrun_detected; static unsigned long prev_overruns; +static atomic_t dropped_count; static void mmio_reset_data(struct trace_array *tr) { @@ -122,11 +124,11 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { - unsigned long cnt = 0; + unsigned long cnt = atomic_xchg(&dropped_count, 0); unsigned long over = ring_buffer_overruns(iter->tr->buffer); if (over > prev_overruns) - cnt = over - prev_overruns; + cnt += over - prev_overruns; prev_overruns = over; return cnt; } @@ -308,8 +310,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); - if (!event) + if (!event) { + atomic_inc(&dropped_count); return; + } entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_RW; @@ -336,8 +340,10 @@ static void __trace_mmiotrace_map(struct trace_array *tr, event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); - if (!event) + if (!event) { + atomic_inc(&dropped_count); return; + } entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_MAP; -- cgit v1.2.3-59-g8ed1b From 18c167fd6d8feec5d337bd8fbc3a17da4cc37652 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 12 Jan 2009 10:00:51 +0800 Subject: ftrace, ia64: make recordmcount distinct module compile In IA64, module build and kernel build use different option. Make recordmcount.pl differentiate the two cases. Signed-off-by: Shaohua Li Acked-by: Sam Ravnborg Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- scripts/Makefile.build | 13 +++++++------ scripts/recordmcount.pl | 6 +++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 5d900307de3e..b5efa98994bb 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -112,13 +112,13 @@ endif # --------------------------------------------------------------------------- # Default is built-in, unless we know otherwise -modkern_cflags := $(CFLAGS_KERNEL) +modkern_cflags = $(if $(part-of-module), $(CFLAGS_MODULE), $(CFLAGS_KERNEL)) quiet_modtag := $(empty) $(empty) -$(real-objs-m) : modkern_cflags := $(CFLAGS_MODULE) -$(real-objs-m:.o=.i) : modkern_cflags := $(CFLAGS_MODULE) -$(real-objs-m:.o=.s) : modkern_cflags := $(CFLAGS_MODULE) -$(real-objs-m:.o=.lst): modkern_cflags := $(CFLAGS_MODULE) +$(real-objs-m) : part-of-module := y +$(real-objs-m:.o=.i) : part-of-module := y +$(real-objs-m:.o=.s) : part-of-module := y +$(real-objs-m:.o=.lst): part-of-module := y $(real-objs-m) : quiet_modtag := [M] $(real-objs-m:.o=.i) : quiet_modtag := [M] @@ -215,7 +215,8 @@ endif ifdef CONFIG_FTRACE_MCOUNT_RECORD cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \ "$(if $(CONFIG_64BIT),64,32)" \ - "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" "$(@)"; + "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" \ + "$(if $(part-of-module),1,0)" "$(@)"; endif define rule_cc_o_c diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index fe831412bea9..282485a22bf8 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -100,14 +100,14 @@ $P =~ s@.*/@@g; my $V = '0.1'; -if ($#ARGV < 6) { - print "usage: $P arch objdump objcopy cc ld nm rm mv inputfile\n"; +if ($#ARGV < 7) { + print "usage: $P arch objdump objcopy cc ld nm rm mv is_module inputfile\n"; print "version: $V\n"; exit(1); } my ($arch, $bits, $objdump, $objcopy, $cc, - $ld, $nm, $rm, $mv, $inputfile) = @ARGV; + $ld, $nm, $rm, $mv, $is_module, $inputfile) = @ARGV; # Acceptable sections to record. my %text_sections = ( -- cgit v1.2.3-59-g8ed1b From 25aac9dc7c8c73798c1be8aa36141f980d32579e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:40 +0800 Subject: ftrace, ia64: explictly ignore a file in recordmcount.pl In IA64, a function pointer isn't a 'unsigned long' but a 'struct {unsigned long ip, unsigned long gp}'. MCOUNT_ADDR is determined at link time not compile time, so explictly ignore kernel/trace/ftrace.o in recordmcount.pl. Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 10 +--------- scripts/recordmcount.pl | 5 +++++ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e54a6ccdb93..76bb884b6e16 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -263,14 +263,6 @@ static void ftrace_update_pid_func(void) # error Dynamic ftrace depends on MCOUNT_RECORD #endif -/* - * Since MCOUNT_ADDR may point to mcount itself, we do not want - * to get it confused by reading a reference in the code as we - * are parsing on objcopy output of text. Use a variable for - * it instead. - */ -static unsigned long mcount_addr = MCOUNT_ADDR; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -575,7 +567,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ip = rec->ip; - ret = ftrace_make_nop(mod, rec, mcount_addr); + ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { ftrace_bug(ret, ip); rec->flags |= FTRACE_FL_FAILED; diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 282485a22bf8..070042b4ccd5 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -109,6 +109,11 @@ if ($#ARGV < 7) { my ($arch, $bits, $objdump, $objcopy, $cc, $ld, $nm, $rm, $mv, $is_module, $inputfile) = @ARGV; +# This file refers to mcount and shouldn't be ftraced, so lets' ignore it +if ($inputfile eq "kernel/trace/ftrace.o") { + exit(0); +} + # Acceptable sections to record. my %text_sections = ( ".text" => 1, -- cgit v1.2.3-59-g8ed1b From f00012074b1a1a67d9c8603617bbbab267347ca6 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:42 +0800 Subject: ftrace, ia64: Add macro for ftrace_caller Define FTRACE_ADDR. In IA64, a function pointer isn't a 'unsigned long' but a 'struct {unsigned long ip, unsigned long gp}'. Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 4 ++++ kernel/trace/ftrace.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b9cb7e..054721487574 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -126,6 +126,10 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); + +#ifndef FTRACE_ADDR +#define FTRACE_ADDR ((unsigned long)ftrace_caller) +#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern void ftrace_graph_caller(void); extern int ftrace_enable_ftrace_graph_caller(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76bb884b6e16..9f536108d3f3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -455,7 +455,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; - ftrace_addr = (unsigned long)ftrace_caller; + ftrace_addr = (unsigned long)FTRACE_ADDR; ip = rec->ip; -- cgit v1.2.3-59-g8ed1b From 418071eb6adbfd3980b2f57f7df8e03921e3f1d7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:44 +0800 Subject: ftrace, ia64: Add recordmcount for ia64 Add recordmcount for ia64. Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- scripts/recordmcount.pl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 070042b4ccd5..2ded5c8e6b85 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -206,6 +206,13 @@ if ($arch eq "x86_64") { $alignment = 2; $section_type = '%progbits'; +} elsif ($arch eq "ia64") { + $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; + $type = "data8"; + + if ($is_module eq "0") { + $cc .= " -mconstant-gp"; + } } else { die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; } -- cgit v1.2.3-59-g8ed1b From d3e75ff14bc1453c4762428395aac9953a023efc Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:46 +0800 Subject: ftrace, ia64: IA64 static ftrace support IA64 ftrace suppport. In IA64, below code will be added in each function if -pg is enabled. alloc r40=ar.pfs,12,8,0 mov r43=r0;; mov r42=b0 mov r41=r1 nop.i 0x0 br.call.sptk.many b0 = _mcount;; Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/ia64/Kconfig | 1 + arch/ia64/include/asm/ftrace.h | 15 +++++++++++++ arch/ia64/kernel/entry.S | 49 ++++++++++++++++++++++++++++++++++++++++++ arch/ia64/kernel/ia64_ksyms.c | 6 ++++++ 4 files changed, 71 insertions(+) create mode 100644 arch/ia64/include/asm/ftrace.h diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 3d31636cbafb..b992ba447c41 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -21,6 +21,7 @@ config IA64 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_FUNCTION_TRACER select HAVE_DMA_ATTRS select HAVE_KVM select HAVE_ARCH_TRACEHOOK diff --git a/arch/ia64/include/asm/ftrace.h b/arch/ia64/include/asm/ftrace.h new file mode 100644 index 000000000000..48694b3ba5a8 --- /dev/null +++ b/arch/ia64/include/asm/ftrace.h @@ -0,0 +1,15 @@ +#ifndef _ASM_IA64_FTRACE_H +#define _ASM_IA64_FTRACE_H + +#ifdef CONFIG_FUNCTION_TRACER +#define MCOUNT_INSN_SIZE 32 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void _mcount(unsigned long pfs, unsigned long r1, unsigned long b0, unsigned long r0); +#define mcount _mcount + +#endif + +#endif /* CONFIG_FUNCTION_TRACER */ + +#endif /* _ASM_IA64_FTRACE_H */ diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index d435f4a7a96c..c2f7d798e2a5 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -47,6 +47,7 @@ #include #include #include +#include #include "minstate.h" @@ -1404,6 +1405,54 @@ GLOBAL_ENTRY(unw_init_running) br.ret.sptk.many rp END(unw_init_running) +#ifdef CONFIG_FUNCTION_TRACER +GLOBAL_ENTRY(_mcount) + movl r2 = ftrace_stub + movl r3 = ftrace_trace_function;; + ld8 r3 = [r3];; + ld8 r3 = [r3];; + cmp.eq p7,p0 = r2, r3 +(p7) br.sptk.many ftrace_stub + ;; + + alloc loc0 = ar.pfs, 4, 4, 2, 0 + ;; + mov loc1 = b0 + mov out0 = b0 + mov loc2 = r8 + mov loc3 = r15 + ;; + adds out0 = -MCOUNT_INSN_SIZE, out0 + mov out1 = in2 + mov b6 = r3 + + br.call.sptk.many b0 = b6 + ;; + mov ar.pfs = loc0 + mov b0 = loc1 + mov r8 = loc2 + mov r15 = loc3 + br ftrace_stub + ;; +END(_mcount) + +GLOBAL_ENTRY(ftrace_stub) + mov r3 = b0 + movl r2 = _mcount_ret_helper + ;; + mov b6 = r2 + mov b7 = r3 + br.ret.sptk.many b6 + +_mcount_ret_helper: + mov b0 = r42 + mov r1 = r41 + mov ar.pfs = r40 + br b7 +END(ftrace_stub) + +#endif /* CONFIG_FUNCTION_TRACER */ + .rodata .align 8 .globl sys_call_table diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 6da1f20d7372..2d311864e359 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -112,3 +112,9 @@ EXPORT_SYMBOL_GPL(esi_call_phys); #endif extern char ia64_ivt[]; EXPORT_SYMBOL(ia64_ivt); + +#include +#ifdef CONFIG_FUNCTION_TRACER +/* mcount is defined in assembly */ +EXPORT_SYMBOL(_mcount); +#endif -- cgit v1.2.3-59-g8ed1b From a14a07b8018b714e03a39ff2180c66e307ef4238 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 9 Jan 2009 11:29:49 +0800 Subject: ftrace, ia64: IA64 dynamic ftrace support IA64 dynamic ftrace support. The original _mcount stub for each function is like: alloc r40=ar.pfs,12,8,0 mov r43=r0;; mov r42=b0 mov r41=r1 nop.i 0x0 br.call.sptk.many b0 = _mcount;; The patch convert it to below for nop: [MII] nop.m 0x0 mov r3=ip nop.i 0x0 [MLX] nop.m 0x0 nop.x 0x0;; This isn't completely nop, as there is one instuction 'mov r3=ip', but it should be light and harmless for code follow it. And below is for call [MII] nop.m 0x0 mov r3=ip nop.i 0x0 [MLX] nop.m 0x0 brl.many .;; In this way, only one instruction is changed to convert code between nop and call. This should meet dyn-ftrace's requirement. But this requires CPU support brl instruction, so dyn-ftrace isn't supported for old Itanium system. Assume there are quite few such old system running. Signed-off-by: Shaohua Li Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/ia64/Kconfig | 2 + arch/ia64/include/asm/ftrace.h | 13 +++ arch/ia64/kernel/Makefile | 5 + arch/ia64/kernel/entry.S | 51 ++++++++++ arch/ia64/kernel/ftrace.c | 206 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 277 insertions(+) create mode 100644 arch/ia64/kernel/ftrace.c diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index b992ba447c41..e20c1d45930a 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -21,6 +21,8 @@ config IA64 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_DYNAMIC_FTRACE if (!ITANIUM) select HAVE_FUNCTION_TRACER select HAVE_DMA_ATTRS select HAVE_KVM diff --git a/arch/ia64/include/asm/ftrace.h b/arch/ia64/include/asm/ftrace.h index 48694b3ba5a8..d20db3c2a656 100644 --- a/arch/ia64/include/asm/ftrace.h +++ b/arch/ia64/include/asm/ftrace.h @@ -8,6 +8,19 @@ extern void _mcount(unsigned long pfs, unsigned long r1, unsigned long b0, unsigned long r0); #define mcount _mcount +#include +/* In IA64, MCOUNT_ADDR is set in link time, so it's not a constant at compile time */ +#define MCOUNT_ADDR (((struct fnptr *)mcount)->ip) +#define FTRACE_ADDR (((struct fnptr *)ftrace_caller)->ip) + +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* second bundle, insn 2 */ + return addr - 0x12; +} + +struct dyn_arch_ftrace { +}; #endif #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index c381ea954892..ab6e7ec0bba3 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -2,6 +2,10 @@ # Makefile for the linux kernel. # +ifdef CONFIG_DYNAMIC_FTRACE +CFLAGS_REMOVE_ftrace.o = -pg +endif + extra-y := head.o init_task.o vmlinux.lds obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ @@ -28,6 +32,7 @@ obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index c2f7d798e2a5..e0be92a6abb0 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1406,6 +1406,56 @@ GLOBAL_ENTRY(unw_init_running) END(unw_init_running) #ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE +GLOBAL_ENTRY(_mcount) + br ftrace_stub +END(_mcount) + +.here: + br.ret.sptk.many b0 + +GLOBAL_ENTRY(ftrace_caller) + alloc out0 = ar.pfs, 8, 0, 4, 0 + mov out3 = r0 + ;; + mov out2 = b0 + add r3 = 0x20, r3 + mov out1 = r1; + br.call.sptk.many b0 = ftrace_patch_gp + //this might be called from module, so we must patch gp +ftrace_patch_gp: + movl gp=__gp + mov b0 = r3 + ;; +.global ftrace_call; +ftrace_call: +{ + .mlx + nop.m 0x0 + movl r3 = .here;; +} + alloc loc0 = ar.pfs, 4, 4, 2, 0 + ;; + mov loc1 = b0 + mov out0 = b0 + mov loc2 = r8 + mov loc3 = r15 + ;; + adds out0 = -MCOUNT_INSN_SIZE, out0 + mov out1 = in2 + mov b6 = r3 + + br.call.sptk.many b0 = b6 + ;; + mov ar.pfs = loc0 + mov b0 = loc1 + mov r8 = loc2 + mov r15 = loc3 + br ftrace_stub + ;; +END(ftrace_caller) + +#else GLOBAL_ENTRY(_mcount) movl r2 = ftrace_stub movl r3 = ftrace_trace_function;; @@ -1435,6 +1485,7 @@ GLOBAL_ENTRY(_mcount) br ftrace_stub ;; END(_mcount) +#endif GLOBAL_ENTRY(ftrace_stub) mov r3 = b0 diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c new file mode 100644 index 000000000000..7fc8c961b1f7 --- /dev/null +++ b/arch/ia64/kernel/ftrace.c @@ -0,0 +1,206 @@ +/* + * Dynamic function tracing support. + * + * Copyright (C) 2008 Shaohua Li + * + * For licencing details, see COPYING. + * + * Defines low-level handling of mcount calls when the kernel + * is compiled with the -pg flag. When using dynamic ftrace, the + * mcount call-sites get patched lazily with NOP till they are + * enabled. All code mutation routines here take effect atomically. + */ + +#include +#include + +#include +#include + +/* In IA64, each function will be added below two bundles with -pg option */ +static unsigned char __attribute__((aligned(8))) +ftrace_orig_code[MCOUNT_INSN_SIZE] = { + 0x02, 0x40, 0x31, 0x10, 0x80, 0x05, /* alloc r40=ar.pfs,12,8,0 */ + 0xb0, 0x02, 0x00, 0x00, 0x42, 0x40, /* mov r43=r0;; */ + 0x05, 0x00, 0xc4, 0x00, /* mov r42=b0 */ + 0x11, 0x48, 0x01, 0x02, 0x00, 0x21, /* mov r41=r1 */ + 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* nop.i 0x0 */ + 0x08, 0x00, 0x00, 0x50 /* br.call.sptk.many b0 = _mcount;; */ +}; + +struct ftrace_orig_insn { + u64 dummy1, dummy2, dummy3; + u64 dummy4:64-41+13; + u64 imm20:20; + u64 dummy5:3; + u64 sign:1; + u64 dummy6:4; +}; + +/* mcount stub will be converted below for nop */ +static unsigned char ftrace_nop_code[MCOUNT_INSN_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ + 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ + 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* nop.x 0x0;; */ + 0x00, 0x00, 0x04, 0x00 +}; + +static unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop_code; +} + +/* + * mcount stub will be converted below for call + * Note: Just the last instruction is changed against nop + * */ +static unsigned char __attribute__((aligned(8))) +ftrace_call_code[MCOUNT_INSN_SIZE] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MII] nop.m 0x0 */ + 0x30, 0x00, 0x00, 0x60, 0x00, 0x00, /* mov r3=ip */ + 0x00, 0x00, 0x04, 0x00, /* nop.i 0x0 */ + 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, /* [MLX] nop.m 0x0 */ + 0xff, 0xff, 0xff, 0xff, 0x7f, 0x00, /* brl.many .;;*/ + 0xf8, 0xff, 0xff, 0xc8 +}; + +struct ftrace_call_insn { + u64 dummy1, dummy2; + u64 dummy3:48; + u64 imm39_l:16; + u64 imm39_h:23; + u64 dummy4:13; + u64 imm20:20; + u64 dummy5:3; + u64 i:1; + u64 dummy6:4; +}; + +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + struct ftrace_call_insn *code = (void *)ftrace_call_code; + unsigned long offset = addr - (ip + 0x10); + + code->imm39_l = offset >> 24; + code->imm39_h = offset >> 40; + code->imm20 = offset >> 4; + code->i = offset >> 63; + return ftrace_call_code; +} + +static int +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code, int do_check) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. We do this by using the + * probe_kernel_* functions. + * + * No real locking needed, this code is run through + * kstop_machine, or before SMP starts. + */ + + if (!do_check) + goto skip_check; + + /* read the text we want to modify */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + +skip_check: + /* replace the text with the new text */ + if (probe_kernel_write(((void *)ip), new_code, MCOUNT_INSN_SIZE)) + return -EPERM; + flush_icache_range(ip, ip + MCOUNT_INSN_SIZE); + + return 0; +} + +static int ftrace_make_nop_check(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned char __attribute__((aligned(8))) replaced[MCOUNT_INSN_SIZE]; + unsigned long ip = rec->ip; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + if (rec->flags & FTRACE_FL_CONVERTED) { + struct ftrace_call_insn *call_insn, *tmp_call; + + call_insn = (void *)ftrace_call_code; + tmp_call = (void *)replaced; + call_insn->imm39_l = tmp_call->imm39_l; + call_insn->imm39_h = tmp_call->imm39_h; + call_insn->imm20 = tmp_call->imm20; + call_insn->i = tmp_call->i; + if (memcmp(replaced, ftrace_call_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + return 0; + } else { + struct ftrace_orig_insn *call_insn, *tmp_call; + + call_insn = (void *)ftrace_orig_code; + tmp_call = (void *)replaced; + call_insn->sign = tmp_call->sign; + call_insn->imm20 = tmp_call->imm20; + if (memcmp(replaced, ftrace_orig_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + return 0; + } +} + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + int ret; + char *new; + + ret = ftrace_make_nop_check(rec, addr); + if (ret) + return ret; + new = ftrace_nop_replace(); + return ftrace_modify_code(rec->ip, NULL, new, 0); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned char *old, *new; + + old= ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + return ftrace_modify_code(ip, old, new, 1); +} + +/* in IA64, _mcount can't directly call ftrace_stub. Only jump is ok */ +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip; + unsigned long addr = ((struct fnptr *)ftrace_call)->ip; + + if (func == ftrace_stub) + return 0; + ip = ((struct fnptr *)func)->ip; + + ia64_patch_imm64(addr + 2, ip); + + flush_icache_range(addr, addr + 16); + return 0; +} + +/* run from kstop_machine */ +int __init ftrace_dyn_arch_init(void *data) +{ + *(unsigned long *)data = 0; + + return 0; +} -- cgit v1.2.3-59-g8ed1b From 002bb86d8d42f18937aef396c3ecd65c7e02e21a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 10 Jan 2009 11:34:13 -0800 Subject: tracing/ftrace: separate events tracing and stats tracing engine Impact: tracing's Api change Currently, the stat tracing depends on the events tracing. When you switch to a new tracer, the stats files of the previous tracer will disappear. But it's more scalable to separate those two engines. This way, we can keep the stat files of one or several tracers when we want, without bothering of multiple tracer stat files or tracer switching. To build/destroys its stats files, a tracer just have to call register_stat_tracer/unregister_stat_tracer everytimes it wants to. Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 - kernel/trace/trace.h | 20 ----- kernel/trace/trace_branch.c | 108 ++++++++++++++----------- kernel/trace/trace_stat.c | 191 +++++++++++++++++++------------------------- kernel/trace/trace_stat.h | 31 +++++++ 5 files changed, 172 insertions(+), 180 deletions(-) create mode 100644 kernel/trace/trace_stat.h diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0418fc338b5c..40217fb499ea 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2353,7 +2353,6 @@ static int tracing_set_tracer(char *buf) if (ret) goto out; } - init_tracer_stat(t); trace_branch_enable(tr); out: @@ -3218,7 +3217,6 @@ __init static int tracer_alloc_buffers(void) #else current_trace = &nop_trace; #endif - init_tracer_stat(current_trace); /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b3f9ad1b4d84..79c872100dd5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -334,24 +334,6 @@ struct tracer_flags { /* Makes more easy to define a tracer opt */ #define TRACER_OPT(s, b) .name = #s, .bit = b -/* - * If you want to provide a stat file (one-shot statistics), fill - * an iterator with stat_start/stat_next and a stat_show callbacks. - * The others callbacks are optional. - */ -struct tracer_stat { - /* The name of your stat file */ - const char *name; - /* Iteration over statistic entries */ - void *(*stat_start)(void); - void *(*stat_next)(void *prev, int idx); - /* Compare two entries for sorting (optional) for stats */ - int (*stat_cmp)(void *p1, void *p2); - /* Print a stat entry */ - int (*stat_show)(struct seq_file *s, void *p); - /* Print the headers of your stat entries */ - int (*stat_headers)(struct seq_file *s); -}; /* * A specific tracer, represented by methods that operate on a trace array: @@ -466,8 +448,6 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); -void init_tracer_stat(struct tracer *trace); - extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_max_latency; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index da5cf3e5581b..ca017e0a9a27 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -16,12 +16,12 @@ #include #include "trace.h" +#include "trace_stat.h" #include "trace_output.h" -static struct tracer branch_trace; - #ifdef CONFIG_BRANCH_TRACER +static struct tracer branch_trace; static int branch_tracing_enabled __read_mostly; static DEFINE_MUTEX(branch_tracing_mutex); @@ -191,6 +191,30 @@ static struct trace_event trace_branch_event = { .binary = trace_nop_print, }; +static struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif /* CONFIG_FTRACE_SELFTEST */ +}; + +__init static int init_branch_tracer(void) +{ + int ret; + + ret = register_ftrace_event(&trace_branch_event); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "branch events\n"); + return 1; + } + return register_tracer(&branch_trace); +} +device_initcall(init_branch_tracer); + #else static inline void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) @@ -305,6 +329,29 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) return 0; } +static struct tracer_stat annotated_branch_stats = { + .name = "branch_annotated", + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_cmp = annotated_branch_stat_cmp, + .stat_headers = annotated_branch_stat_headers, + .stat_show = branch_stat_show +}; + +__init static int init_annotated_branch_stats(void) +{ + int ret; + + ret = register_stat_tracer(&annotated_branch_stats); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "annotated branches stats\n"); + return 1; + } + return 0; +} +fs_initcall(init_annotated_branch_stats); + #ifdef CONFIG_PROFILE_ALL_BRANCHES extern unsigned long __start_branch_profile[]; @@ -339,60 +386,25 @@ all_branch_stat_next(void *v, int idx) return p; } -static struct tracer_stat branch_stats[] = { - {.name = "annotated", - .stat_start = annotated_branch_stat_start, - .stat_next = annotated_branch_stat_next, - .stat_cmp = annotated_branch_stat_cmp, - .stat_headers = annotated_branch_stat_headers, - .stat_show = branch_stat_show}, - - {.name = "all", +static struct tracer_stat all_branch_stats = { + .name = "branch_all", .stat_start = all_branch_stat_start, .stat_next = all_branch_stat_next, .stat_headers = all_branch_stat_headers, - .stat_show = branch_stat_show}, - - { } -}; -#else -static struct tracer_stat branch_stats[] = { - {.name = "annotated", - .stat_start = annotated_branch_stat_start, - .stat_next = annotated_branch_stat_next, - .stat_cmp = annotated_branch_stat_cmp, - .stat_headers = annotated_branch_stat_headers, - .stat_show = branch_stat_show}, - - { } + .stat_show = branch_stat_show }; -#endif /* CONFIG_PROFILE_ALL_BRANCHES */ - -static struct tracer branch_trace __read_mostly = +__init static int all_annotated_branch_stats(void) { - .name = "branch", -#ifdef CONFIG_BRANCH_TRACER - .init = branch_trace_init, - .reset = branch_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_branch, -#endif /* CONFIG_FTRACE_SELFTEST */ -#endif - .stats = branch_stats -}; - -__init static int init_branch_trace(void) -{ -#ifdef CONFIG_BRANCH_TRACER int ret; - ret = register_ftrace_event(&trace_branch_event); + + ret = register_stat_tracer(&all_branch_stats); if (!ret) { - printk(KERN_WARNING "Warning: could not register branch events\n"); + printk(KERN_WARNING "Warning: could not register " + "all branches stats\n"); return 1; } -#endif - - return register_tracer(&branch_trace); + return 0; } -device_initcall(init_branch_trace); +fs_initcall(all_annotated_branch_stats); +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 1515f9e7adfc..cb29282b9485 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -10,28 +10,32 @@ #include -#include #include +#include "trace_stat.h" #include "trace.h" /* List of stat entries from a tracer */ struct trace_stat_list { - struct list_head list; - void *stat; + struct list_head list; + void *stat; }; /* A stat session is the stats output in one file */ struct tracer_stat_session { - struct tracer_stat *ts; - struct list_head stat_list; - struct mutex stat_mutex; + struct list_head session_list; + struct tracer_stat *ts; + struct list_head stat_list; + struct mutex stat_mutex; + struct dentry *file; }; /* All of the sessions currently in use. Each stat file embeed one session */ -static struct tracer_stat_session **all_stat_sessions; -static int nb_sessions; -static struct dentry *stat_dir, **stat_files; +static LIST_HEAD(all_stat_sessions); +static DEFINE_MUTEX(all_stat_sessions_mutex); + +/* The root directory for all stat files */ +static struct dentry *stat_dir; static void reset_stat_session(struct tracer_stat_session *session) @@ -44,66 +48,77 @@ static void reset_stat_session(struct tracer_stat_session *session) INIT_LIST_HEAD(&session->stat_list); } -/* Called when a tracer is initialized */ -static int init_all_sessions(int nb, struct tracer_stat *ts) +static void destroy_session(struct tracer_stat_session *session) { - int i, j; - struct tracer_stat_session *session; + debugfs_remove(session->file); + reset_stat_session(session); + mutex_destroy(&session->stat_mutex); + kfree(session); +} - nb_sessions = 0; - if (all_stat_sessions) { - for (i = 0; i < nb_sessions; i++) { - session = all_stat_sessions[i]; - reset_stat_session(session); - mutex_destroy(&session->stat_mutex); - kfree(session); - } - } - all_stat_sessions = kmalloc(sizeof(struct tracer_stat_session *) * nb, - GFP_KERNEL); - if (!all_stat_sessions) - return -ENOMEM; +static int init_stat_file(struct tracer_stat_session *session); - for (i = 0; i < nb; i++) { - session = kmalloc(sizeof(struct tracer_stat_session) * nb, - GFP_KERNEL); - if (!session) - goto free_sessions; +int register_stat_tracer(struct tracer_stat *trace) +{ + struct tracer_stat_session *session, *node, *tmp; + int ret; + + if (!trace) + return -EINVAL; + + if (!trace->stat_start || !trace->stat_next || !trace->stat_show) + return -EINVAL; - INIT_LIST_HEAD(&session->stat_list); - mutex_init(&session->stat_mutex); - session->ts = &ts[i]; - all_stat_sessions[i] = session; + /* Already registered? */ + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) + return -EINVAL; } - nb_sessions = nb; - return 0; + mutex_unlock(&all_stat_sessions_mutex); + + /* Init the session */ + session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); + if (!session) + return -ENOMEM; -free_sessions: + session->ts = trace; + INIT_LIST_HEAD(&session->session_list); + INIT_LIST_HEAD(&session->stat_list); + mutex_init(&session->stat_mutex); + session->file = NULL; - for (j = 0; j < i; j++) - kfree(all_stat_sessions[i]); + ret = init_stat_file(session); + if (ret) { + destroy_session(session); + return ret; + } - kfree(all_stat_sessions); - all_stat_sessions = NULL; + /* Register */ + mutex_lock(&all_stat_sessions_mutex); + list_add_tail(&session->session_list, &all_stat_sessions); + mutex_unlock(&all_stat_sessions_mutex); - return -ENOMEM; + return 0; } -static int basic_tracer_stat_checks(struct tracer_stat *ts) +void unregister_stat_tracer(struct tracer_stat *trace) { - int i; + struct tracer_stat_session *node, *tmp; - if (!ts) - return 0; - - for (i = 0; ts[i].name; i++) { - if (!ts[i].stat_start || !ts[i].stat_next || !ts[i].stat_show) - return -EBUSY; + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + list_del(&node->session_list); + destroy_session(node); + break; + } } - return i; + mutex_unlock(&all_stat_sessions_mutex); } + /* * For tracers that don't provide a stat_cmp callback. * This one will force an immediate insertion on tail of @@ -280,63 +295,7 @@ static const struct file_operations tracing_stat_fops = { .release = tracing_stat_release }; - -static void destroy_trace_stat_files(void) -{ - int i; - - if (stat_files) { - for (i = 0; i < nb_sessions; i++) - debugfs_remove(stat_files[i]); - kfree(stat_files); - stat_files = NULL; - } -} - -static void init_trace_stat_files(void) -{ - int i; - - if (!stat_dir || !nb_sessions) - return; - - stat_files = kmalloc(sizeof(struct dentry *) * nb_sessions, GFP_KERNEL); - - if (!stat_files) { - pr_warning("trace stat: not enough memory\n"); - return; - } - - for (i = 0; i < nb_sessions; i++) { - struct tracer_stat_session *session = all_stat_sessions[i]; - stat_files[i] = debugfs_create_file(session->ts->name, 0644, - stat_dir, - session, &tracing_stat_fops); - if (!stat_files[i]) - pr_warning("cannot create %s entry\n", - session->ts->name); - } -} - -void init_tracer_stat(struct tracer *trace) -{ - int nb = basic_tracer_stat_checks(trace->stats); - - destroy_trace_stat_files(); - - if (nb < 0) { - pr_warning("stat tracing: missing stat callback on %s\n", - trace->name); - return; - } - if (!nb) - return; - - init_all_sessions(nb, trace->stats); - init_trace_stat_files(); -} - -static int __init tracing_stat_init(void) +static int tracing_stat_init(void) { struct dentry *d_tracing; @@ -348,4 +307,16 @@ static int __init tracing_stat_init(void) "'trace_stat' entry\n"); return 0; } -fs_initcall(tracing_stat_init); + +static int init_stat_file(struct tracer_stat_session *session) +{ + if (!stat_dir && tracing_stat_init()) + return -ENODEV; + + session->file = debugfs_create_file(session->ts->name, 0644, + stat_dir, + session, &tracing_stat_fops); + if (!session->file) + return -ENOMEM; + return 0; +} diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h new file mode 100644 index 000000000000..202274cf7f3d --- /dev/null +++ b/kernel/trace/trace_stat.h @@ -0,0 +1,31 @@ +#ifndef __TRACE_STAT_H +#define __TRACE_STAT_H + +#include + +/* + * If you want to provide a stat file (one-shot statistics), fill + * an iterator with stat_start/stat_next and a stat_show callbacks. + * The others callbacks are optional. + */ +struct tracer_stat { + /* The name of your stat file */ + const char *name; + /* Iteration over statistic entries */ + void *(*stat_start)(void); + void *(*stat_next)(void *prev, int idx); + /* Compare two entries for stats sorting */ + int (*stat_cmp)(void *p1, void *p2); + /* Print a stat entry */ + int (*stat_show)(struct seq_file *s, void *p); + /* Print the headers of your stat entries */ + int (*stat_headers)(struct seq_file *s); +}; + +/* + * Destroy or create a stat file + */ +extern int register_stat_tracer(struct tracer_stat *trace); +extern void unregister_stat_tracer(struct tracer_stat *trace); + +#endif /* __TRACE_STAT_H */ -- cgit v1.2.3-59-g8ed1b From e1d8aa9f1dd655a3534b22fcfbecb70cdb125766 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 12 Jan 2009 23:15:46 +0100 Subject: tracing: add a new workqueue tracer Impact: new tracer The workqueue tracer provides some statistical informations about each cpu workqueue thread such as the number of the works inserted and executed since their creation. It can help to evaluate the amount of work each of them have to perform. For example it can help a developer to decide whether he should choose a per cpu workqueue instead of a singlethreaded one. It only traces statistical informations for now but it will probably later provide event tracing too. Such a tracer could help too, and be improved, to help rt priority sorted workqueue development. To have a snapshot of the workqueues state at any time, just do cat /debugfs/tracing/trace_stat/workqueues Ie: 1 125 125 reiserfs/1 1 0 0 scsi_tgtd/1 1 0 0 aio/1 1 0 0 ata/1 1 114 114 kblockd/1 1 0 0 kintegrityd/1 1 2147 2147 events/1 0 0 0 kpsmoused 0 105 105 reiserfs/0 0 0 0 scsi_tgtd/0 0 0 0 aio/0 0 0 0 ata_aux 0 0 0 ata/0 0 0 0 cqueue 0 0 0 kacpi_notify 0 0 0 kacpid 0 149 149 kblockd/0 0 0 0 kintegrityd/0 0 1000 1000 khelper 0 2270 2270 events/0 Changes in V2: _ Drop the static array based on NR_CPU and dynamically allocate the stat array with num_possible_cpus() and other cpu mask facilities.... _ Trace workqueue insertion at a bit lower level (insert_work instead of queue_work) to handle even the workqueue barriers. Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/trace/workqueue.h | 25 ++++ kernel/trace/Kconfig | 11 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_workqueue.c | 287 +++++++++++++++++++++++++++++++++++++++++ kernel/workqueue.c | 16 ++- 5 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 include/trace/workqueue.h create mode 100644 kernel/trace/trace_workqueue.c diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h new file mode 100644 index 000000000000..867829df4571 --- /dev/null +++ b/include/trace/workqueue.h @@ -0,0 +1,25 @@ +#ifndef __TRACE_WORKQUEUE_H +#define __TRACE_WORKQUEUE_H + +#include +#include +#include + +DECLARE_TRACE(workqueue_insertion, + TPPROTO(struct task_struct *wq_thread, struct work_struct *work), + TPARGS(wq_thread, work)); + +DECLARE_TRACE(workqueue_execution, + TPPROTO(struct task_struct *wq_thread, struct work_struct *work), + TPARGS(wq_thread, work)); + +/* Trace the creation of one workqueue thread on a cpu */ +DECLARE_TRACE(workqueue_creation, + TPPROTO(struct task_struct *wq_thread, int cpu), + TPARGS(wq_thread, cpu)); + +DECLARE_TRACE(workqueue_destruction, + TPPROTO(struct task_struct *wq_thread), + TPARGS(wq_thread)); + +#endif /* __TRACE_WORKQUEUE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 944239296f13..dde1d46f77e5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -284,6 +284,17 @@ config KMEMTRACE If unsure, say N. +config WORKQUEUE_TRACER + bool "Trace workqueues" + select TRACING + help + The workqueue tracer provides some statistical informations + about each cpu workqueue thread such as the number of the + works inserted and executed since their creation. It can help + to evaluate the amount of work each of them have to perform. + For example it can help a developer to decide whether he should + choose a per cpu workqueue instead of a singlethreaded one. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 05c9182061de..f76d48f3527d 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -36,5 +36,6 @@ obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o +obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c new file mode 100644 index 000000000000..f8118d39ca9b --- /dev/null +++ b/kernel/trace/trace_workqueue.c @@ -0,0 +1,287 @@ +/* + * Workqueue statistical tracer. + * + * Copyright (C) 2008 Frederic Weisbecker + * + */ + + +#include +#include +#include "trace_stat.h" +#include "trace.h" + + +/* A cpu workqueue thread */ +struct cpu_workqueue_stats { + struct list_head list; +/* Useful to know if we print the cpu headers */ + bool first_entry; + int cpu; + pid_t pid; +/* Can be inserted from interrupt or user context, need to be atomic */ + atomic_t inserted; +/* + * Don't need to be atomic, works are serialized in a single workqueue thread + * on a single CPU. + */ + unsigned int executed; +}; + +/* List of workqueue threads on one cpu */ +struct workqueue_global_stats { + struct list_head list; + spinlock_t lock; +}; + +/* Don't need a global lock because allocated before the workqueues, and + * never freed. + */ +static struct workqueue_global_stats *all_workqueue_stat; + +/* Insertion of a work */ +static void +probe_workqueue_insertion(struct task_struct *wq_thread, + struct work_struct *work) +{ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + list) { + if (node->pid == wq_thread->pid) { + atomic_inc(&node->inserted); + goto found; + } + } + pr_debug("trace_workqueue: entry not found\n"); +found: + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); +} + +/* Execution of a work */ +static void +probe_workqueue_execution(struct task_struct *wq_thread, + struct work_struct *work) +{ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + list) { + if (node->pid == wq_thread->pid) { + node->executed++; + goto found; + } + } + pr_debug("trace_workqueue: entry not found\n"); +found: + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); +} + +/* Creation of a cpu workqueue thread */ +static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) +{ + struct cpu_workqueue_stats *cws; + unsigned long flags; + + WARN_ON(cpu < 0 || cpu >= num_possible_cpus()); + + /* Workqueues are sometimes created in atomic context */ + cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); + if (!cws) { + pr_warning("trace_workqueue: not enough memory\n"); + return; + } + tracing_record_cmdline(wq_thread); + + INIT_LIST_HEAD(&cws->list); + cws->cpu = cpu; + + cws->pid = wq_thread->pid; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + if (list_empty(&all_workqueue_stat[cpu].list)) + cws->first_entry = true; + list_add_tail(&cws->list, &all_workqueue_stat[cpu].list); + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); +} + +/* Destruction of a cpu workqueue thread */ +static void probe_workqueue_destruction(struct task_struct *wq_thread) +{ + /* Workqueue only execute on one cpu */ + int cpu = cpumask_first(&wq_thread->cpus_allowed); + struct cpu_workqueue_stats *node, *next; + unsigned long flags; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + list) { + if (node->pid == wq_thread->pid) { + list_del(&node->list); + kfree(node); + goto found; + } + } + + pr_debug("trace_workqueue: don't find workqueue to destroy\n"); +found: + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + +} + +static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) +{ + unsigned long flags; + struct cpu_workqueue_stats *ret = NULL; + + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + + if (!list_empty(&all_workqueue_stat[cpu].list)) + ret = list_entry(all_workqueue_stat[cpu].list.next, + struct cpu_workqueue_stats, list); + + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + + return ret; +} + +static void *workqueue_stat_start(void) +{ + int cpu; + void *ret = NULL; + + for_each_possible_cpu(cpu) { + ret = workqueue_stat_start_cpu(cpu); + if (ret) + return ret; + } + return NULL; +} + +static void *workqueue_stat_next(void *prev, int idx) +{ + struct cpu_workqueue_stats *prev_cws = prev; + int cpu = prev_cws->cpu; + unsigned long flags; + void *ret = NULL; + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + if (list_is_last(&prev_cws->list, &all_workqueue_stat[cpu].list)) { + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + for (++cpu ; cpu < num_possible_cpus(); cpu++) { + ret = workqueue_stat_start_cpu(cpu); + if (ret) + return ret; + } + return NULL; + } + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + + return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, + list); +} + +static int workqueue_stat_show(struct seq_file *s, void *p) +{ + struct cpu_workqueue_stats *cws = p; + unsigned long flags; + int cpu = cws->cpu; + + seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, + atomic_read(&cws->inserted), + cws->executed, + trace_find_cmdline(cws->pid)); + + spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + if (&cws->list == all_workqueue_stat[cpu].list.next) + seq_printf(s, "\n"); + spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + + return 0; +} + +static int workqueue_stat_headers(struct seq_file *s) +{ + seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); + seq_printf(s, "# | | | |\n\n"); + return 0; +} + +struct tracer_stat workqueue_stats __read_mostly = { + .name = "workqueues", + .stat_start = workqueue_stat_start, + .stat_next = workqueue_stat_next, + .stat_show = workqueue_stat_show, + .stat_headers = workqueue_stat_headers +}; + + +int __init stat_workqueue_init(void) +{ + if (register_stat_tracer(&workqueue_stats)) { + pr_warning("Unable to register workqueue stat tracer\n"); + return 1; + } + + return 0; +} +fs_initcall(stat_workqueue_init); + +/* + * Workqueues are created very early, just after pre-smp initcalls. + * So we must register our tracepoints at this stage. + */ +int __init trace_workqueue_early_init(void) +{ + int ret, cpu; + + ret = register_trace_workqueue_insertion(probe_workqueue_insertion); + if (ret) + goto out; + + ret = register_trace_workqueue_execution(probe_workqueue_execution); + if (ret) + goto no_insertion; + + ret = register_trace_workqueue_creation(probe_workqueue_creation); + if (ret) + goto no_execution; + + ret = register_trace_workqueue_destruction(probe_workqueue_destruction); + if (ret) + goto no_creation; + + all_workqueue_stat = kmalloc(sizeof(struct workqueue_global_stats) + * num_possible_cpus(), GFP_KERNEL); + + if (!all_workqueue_stat) { + pr_warning("trace_workqueue: not enough memory\n"); + goto no_creation; + } + + for_each_possible_cpu(cpu) { + spin_lock_init(&all_workqueue_stat[cpu].lock); + INIT_LIST_HEAD(&all_workqueue_stat[cpu].list); + } + + return 0; + +no_creation: + unregister_trace_workqueue_creation(probe_workqueue_creation); +no_execution: + unregister_trace_workqueue_execution(probe_workqueue_execution); +no_insertion: + unregister_trace_workqueue_insertion(probe_workqueue_insertion); +out: + pr_warning("trace_workqueue: unable to trace workqueues\n"); + + return 1; +} +early_initcall(trace_workqueue_early_init); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2f445833ae37..1fc2bc20603f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,6 +33,7 @@ #include #include #include +#include /* * The per-CPU workqueue (if single thread, we always use the first @@ -125,9 +126,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } +DEFINE_TRACE(workqueue_insertion); + static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head) { + trace_workqueue_insertion(cwq->thread, work); + set_wq_data(work, cwq); /* * Ensure that we get the right work->data if we see the @@ -259,6 +264,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); +DEFINE_TRACE(workqueue_execution); + static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); @@ -284,7 +291,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) */ struct lockdep_map lockdep_map = work->lockdep_map; #endif - + trace_workqueue_execution(cwq->thread, work); cwq->current_work = work; list_del_init(cwq->worklist.next); spin_unlock_irq(&cwq->lock); @@ -765,6 +772,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) return cwq; } +DEFINE_TRACE(workqueue_creation); + static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -787,6 +796,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); cwq->thread = p; + trace_workqueue_creation(cwq->thread, cpu); + return 0; } @@ -868,6 +879,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name, } EXPORT_SYMBOL_GPL(__create_workqueue_key); +DEFINE_TRACE(workqueue_destruction); + static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* @@ -891,6 +904,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) * checks list_empty(), and a "normal" queue_work() can't use * a dead CPU. */ + trace_workqueue_destruction(cwq->thread); kthread_stop(cwq->thread); cwq->thread = NULL; } -- cgit v1.2.3-59-g8ed1b From 32632920a788fb13da35b131b77cc4324c38c1c5 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-Koenig Date: Mon, 12 Jan 2009 23:35:50 +0100 Subject: ftrace, trivial: fix typo "resgister" -> "register" Signed-off-by: Uwe Kleine-Koenig Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9f536108d3f3..8c1c9c0f4775 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1894,7 +1894,7 @@ int register_ftrace_function(struct ftrace_ops *ops) } /** - * unregister_ftrace_function - unresgister a function for profiling. + * unregister_ftrace_function - unregister a function for profiling. * @ops - ops structure that holds the function to unregister * * Unregister a function that was added to be called by ftrace profiling. -- cgit v1.2.3-59-g8ed1b From 428aee1460a75197f0190534b4d610450ee59af1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 Jan 2009 12:24:42 -0500 Subject: trace: print ftrace_dump at KERN_EMERG log level Impact: fix to print out ftrace_dump when expected I was debugging a hard race condition to only find out that after I hit the race, my log level was not at level to show KERN_INFO. The time it took to trigger the race was wasted because I did not capture the trace. Since ftrace_dump is only called from kernel oops (and only when it is set in the kernel command line to do so), or when a developer adds it to their own local tree, the log level of the print should be at KERN_EMERG to make sure the print appears. ftrace_dump is not called by a normal user setup, and will not add extra unwanted print out to the console. There is no reason it should be at KERN_INFO. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 40217fb499ea..408c03f2b8a9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3076,7 +3076,7 @@ static struct notifier_block trace_die_notifier = { * it if we decide to change what log level the ftrace dump * should be at. */ -#define KERN_TRACE KERN_INFO +#define KERN_TRACE KERN_EMERG static void trace_printk_seq(struct trace_seq *s) -- cgit v1.2.3-59-g8ed1b From 6f3b34402e7282cde49dff395d7ea462bf33bf50 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 12 Jan 2009 11:06:18 +0800 Subject: ring_buffer: reset write when reserve buffer fail Impact: reset struct buffer_page.write when interrupt storm if struct buffer_page.write is not reset, any succedent committing will corrupted ring_buffer: static inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { ...... cpu_buffer->commit_page->commit = cpu_buffer->commit_page->write; ...... } when "if (RB_WARN_ON(cpu_buffer, next_page == reader_page))", ring_buffer is disabled, but some reserved buffers may haven't been committed. we need reset struct buffer_page.write. when "if (unlikely(next_page == cpu_buffer->commit_page))", ring_buffer is still available, we should not corrupt it. Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4832ffa5d937..0b9de5a3d699 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1017,12 +1017,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, } if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) { - /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); + if (!(buffer->flags & RB_FL_OVERWRITE)) goto out_unlock; - } /* tail_page has not moved yet? */ if (tail_page == cpu_buffer->tail_page) { @@ -1097,6 +1093,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; out_unlock: + /* reset write */ + if (tail <= BUF_PAGE_SIZE) + local_set(&tail_page->write, tail); + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; -- cgit v1.2.3-59-g8ed1b From 0ee6b6cf5bdb793b4c68507dd65adf16341aa4ca Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 Jan 2009 14:50:19 -0500 Subject: trace: stop all recording to ring buffer on ftrace_dump Impact: limit ftrace dump output Currently ftrace_dump only calls ftrace_kill that is a fast way to prevent the function tracer functions from being called (just sets a flag and clears the function to call, nothing else). It is better to also turn off any recording to the ring buffers as well. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 408c03f2b8a9..dcb757f70d21 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3110,6 +3110,7 @@ void ftrace_dump(void) dump_ran = 1; /* No turning back! */ + tracing_off(); ftrace_kill(); for_each_tracing_cpu(cpu) { -- cgit v1.2.3-59-g8ed1b From 4a2b8dda3f8705880ec7408135645602d5590f51 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 14 Jan 2009 13:33:27 -0800 Subject: tracing/function-graph-tracer: fix a regression while suspend to disk Impact: fix a crash while kernel image restore When the function graph tracer is running and while suspend to disk, some racy and dangerous things happen against this tracer. The current task will save its registers including the stack pointer which contains the return address hooked by the tracer. But the current task will continue to enter other functions after that to save the memory, and then it will store other return addresses, and finally loose the old depth which matches the return address saved in the old stack (during the registers saving). So on image restore, the code will return to wrong addresses. And there are other things: on restore, the task will have it's "current" pointer overwritten during registers restoring....switching from one task to another... That would be insane to try to trace function graphs at these stages. This patch makes the function graph tracer listening on power events, making it's tracing disabled for the current task (the one that performs the hibernation work) while suspend/resume to disk, making the tracing safe during hibernation. Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8c1c9c0f4775..7e9a20b69939 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1957,6 +1958,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static atomic_t ftrace_graph_active; +static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { @@ -2035,6 +2037,27 @@ static int start_graph_tracing(void) return ret; } +/* + * Hibernation protection. + * The state of the current task is too much unstable during + * suspend/restore to disk. We want to protect against that. + */ +static int +ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, + void *unused) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + pause_graph_tracing(); + break; + + case PM_POST_HIBERNATION: + unpause_graph_tracing(); + break; + } + return NOTIFY_DONE; +} + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -2042,6 +2065,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_sysctl_lock); + ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; + register_pm_notifier(&ftrace_suspend_notifier); + atomic_inc(&ftrace_graph_active); ret = start_graph_tracing(); if (ret) { @@ -2067,6 +2093,7 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); mutex_unlock(&ftrace_sysctl_lock); } -- cgit v1.2.3-59-g8ed1b From 42fab4b2cdc02cf28e2474ccfd75bc9225076590 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Jan 2009 09:30:52 +0800 Subject: tracing/ftrace: add missing unlock in register_stat_tracer() We should unlock all_stat_sessions_mutex before returning failure. Signed-off-by: Li Zefan Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index cb29282b9485..2110cea2ece3 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -73,8 +73,10 @@ int register_stat_tracer(struct tracer_stat *trace) /* Already registered? */ mutex_lock(&all_stat_sessions_mutex); list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { - if (node->ts == trace) + if (node->ts == trace) { + mutex_unlock(&all_stat_sessions_mutex); return -EINVAL; + } } mutex_unlock(&all_stat_sessions_mutex); -- cgit v1.2.3-59-g8ed1b From 55922173f1f63903b6de03711ab8ff980cbe58d2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 Jan 2009 11:31:21 +0100 Subject: tracing: trace_stat.c cleanup Impact: cleanup - whitespace / code alignment cleanups - avoid unnecessary forward prototype by reordering functions Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 147 ++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 76 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 2110cea2ece3..eae9cef39291 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -17,16 +17,16 @@ /* List of stat entries from a tracer */ struct trace_stat_list { - struct list_head list; - void *stat; + struct list_head list; + void *stat; }; /* A stat session is the stats output in one file */ struct tracer_stat_session { struct list_head session_list; - struct tracer_stat *ts; - struct list_head stat_list; - struct mutex stat_mutex; + struct tracer_stat *ts; + struct list_head stat_list; + struct mutex stat_mutex; struct dentry *file; }; @@ -35,7 +35,7 @@ static LIST_HEAD(all_stat_sessions); static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ -static struct dentry *stat_dir; +static struct dentry *stat_dir; static void reset_stat_session(struct tracer_stat_session *session) @@ -56,71 +56,6 @@ static void destroy_session(struct tracer_stat_session *session) kfree(session); } - -static int init_stat_file(struct tracer_stat_session *session); - -int register_stat_tracer(struct tracer_stat *trace) -{ - struct tracer_stat_session *session, *node, *tmp; - int ret; - - if (!trace) - return -EINVAL; - - if (!trace->stat_start || !trace->stat_next || !trace->stat_show) - return -EINVAL; - - /* Already registered? */ - mutex_lock(&all_stat_sessions_mutex); - list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { - if (node->ts == trace) { - mutex_unlock(&all_stat_sessions_mutex); - return -EINVAL; - } - } - mutex_unlock(&all_stat_sessions_mutex); - - /* Init the session */ - session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); - if (!session) - return -ENOMEM; - - session->ts = trace; - INIT_LIST_HEAD(&session->session_list); - INIT_LIST_HEAD(&session->stat_list); - mutex_init(&session->stat_mutex); - session->file = NULL; - - ret = init_stat_file(session); - if (ret) { - destroy_session(session); - return ret; - } - - /* Register */ - mutex_lock(&all_stat_sessions_mutex); - list_add_tail(&session->session_list, &all_stat_sessions); - mutex_unlock(&all_stat_sessions_mutex); - - return 0; -} - -void unregister_stat_tracer(struct tracer_stat *trace) -{ - struct tracer_stat_session *node, *tmp; - - mutex_lock(&all_stat_sessions_mutex); - list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { - if (node->ts == trace) { - list_del(&node->session_list); - destroy_session(node); - break; - } - } - mutex_unlock(&all_stat_sessions_mutex); -} - - /* * For tracers that don't provide a stat_cmp callback. * This one will force an immediate insertion on tail of @@ -252,10 +187,10 @@ static int stat_seq_show(struct seq_file *s, void *v) } static const struct seq_operations trace_stat_seq_ops = { - .start = stat_seq_start, - .next = stat_seq_next, - .stop = stat_seq_stop, - .show = stat_seq_show + .start = stat_seq_start, + .next = stat_seq_next, + .stop = stat_seq_stop, + .show = stat_seq_show }; /* The session stat is refilled and resorted at each stat file opening */ @@ -275,7 +210,6 @@ static int tracing_stat_open(struct inode *inode, struct file *file) return ret; } - /* * Avoid consuming memory with our now useless list. */ @@ -322,3 +256,64 @@ static int init_stat_file(struct tracer_stat_session *session) return -ENOMEM; return 0; } + +int register_stat_tracer(struct tracer_stat *trace) +{ + struct tracer_stat_session *session, *node, *tmp; + int ret; + + if (!trace) + return -EINVAL; + + if (!trace->stat_start || !trace->stat_next || !trace->stat_show) + return -EINVAL; + + /* Already registered? */ + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + mutex_unlock(&all_stat_sessions_mutex); + return -EINVAL; + } + } + mutex_unlock(&all_stat_sessions_mutex); + + /* Init the session */ + session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); + if (!session) + return -ENOMEM; + + session->ts = trace; + INIT_LIST_HEAD(&session->session_list); + INIT_LIST_HEAD(&session->stat_list); + mutex_init(&session->stat_mutex); + session->file = NULL; + + ret = init_stat_file(session); + if (ret) { + destroy_session(session); + return ret; + } + + /* Register */ + mutex_lock(&all_stat_sessions_mutex); + list_add_tail(&session->session_list, &all_stat_sessions); + mutex_unlock(&all_stat_sessions_mutex); + + return 0; +} + +void unregister_stat_tracer(struct tracer_stat *trace) +{ + struct tracer_stat_session *node, *tmp; + + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + list_del(&node->session_list); + destroy_session(node); + break; + } + } + mutex_unlock(&all_stat_sessions_mutex); +} -- cgit v1.2.3-59-g8ed1b From 9d87c3192d96ef9ac1cec8321538e9b35e90b5aa Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 14 Jan 2009 23:22:07 -0600 Subject: [XFS] Remove the rest of the macro-to-function indirections. Remove the last of the macros-defined-to-static-functions. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/quota/xfs_dquot.c | 2 +- fs/xfs/xfs_ag.h | 6 +- fs/xfs/xfs_alloc_btree.c | 2 +- fs/xfs/xfs_attr.c | 26 ++++---- fs/xfs/xfs_bmap.c | 166 +++++++++++++++++++++++----------------------- fs/xfs/xfs_bmap.h | 2 - fs/xfs/xfs_bmap_btree.c | 10 +-- fs/xfs/xfs_bmap_btree.h | 4 -- fs/xfs/xfs_btree.c | 6 +- fs/xfs/xfs_da_btree.c | 8 +-- fs/xfs/xfs_ialloc.c | 6 +- fs/xfs/xfs_ialloc.h | 2 - fs/xfs/xfs_ialloc_btree.h | 1 - fs/xfs/xfs_inode.c | 6 +- fs/xfs/xfs_inode_item.h | 4 -- fs/xfs/xfs_iomap.c | 10 +-- fs/xfs/xfs_itable.c | 6 +- fs/xfs/xfs_mount.h | 6 +- fs/xfs/xfs_rename.c | 2 +- fs/xfs/xfs_rtalloc.c | 2 +- fs/xfs/xfs_rw.h | 1 - fs/xfs/xfs_sb.h | 2 +- fs/xfs/xfs_vnodeops.c | 20 +++--- 23 files changed, 142 insertions(+), 158 deletions(-) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 591ca6602bfb..d68b4e1cf1d1 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -421,7 +421,7 @@ xfs_qm_dqalloc( /* * Initialize the bmap freelist prior to calling bmapi code. */ - XFS_BMAP_INIT(&flist, &firstblock); + xfs_bmap_init(&flist, &firstblock); xfs_ilock(quotip, XFS_ILOCK_EXCL); /* * Return if this type of quotas is turned off while we didn't diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index d3b3cf742999..143d63ecb20a 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -244,8 +244,8 @@ typedef struct xfs_perag #define XFS_AG_CHECK_DADDR(mp,d,len) \ ((len) == 1 ? \ ASSERT((d) == XFS_SB_DADDR || \ - XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \ - ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \ - XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1))) + xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ + ASSERT(xfs_daddr_to_agno(mp, d) == \ + xfs_daddr_to_agno(mp, (d) + (len) - 1))) #endif /* __XFS_AG_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 733cb75a8c5d..c10c3a292d30 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -115,7 +115,7 @@ xfs_allocbt_free_block( xfs_agblock_t bno; int error; - bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp)); + bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index f7cdc28aff41..5fde1654b430 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -374,7 +374,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, * It won't fit in the shortform, transform to a leaf block. * GROT: another possible req'mt for a double-split btree op. */ - XFS_BMAP_INIT(args.flist, args.firstblock); + xfs_bmap_init(args.flist, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); if (!error) { error = xfs_bmap_finish(&args.trans, args.flist, @@ -956,7 +956,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit that transaction so that the node_addname() call * can manage its own transactions. */ - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1057,7 +1057,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { @@ -1135,7 +1135,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { @@ -1290,7 +1290,7 @@ restart: * have been a b-tree. */ xfs_da_state_free(state); - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1331,7 +1331,7 @@ restart: * in the index/blkno/rmtblkno/rmtblkcnt fields and * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. */ - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_da_split(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1443,7 +1443,7 @@ restart: * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_da_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, @@ -1579,7 +1579,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_da_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, @@ -1630,7 +1630,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) == XFS_ATTR_LEAF_MAGIC); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { @@ -2069,7 +2069,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) /* * Allocate a single extent, up to the size of the value. */ - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, @@ -2123,7 +2123,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) /* * Try to remember where we decided to put the value. */ - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, args->rmtblkcnt, @@ -2188,7 +2188,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) /* * Try to remember where we decided to put the value. */ - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, args->rmtblkcnt, @@ -2229,7 +2229,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) blkcnt = args->rmtblkcnt; done = 0; while (!done) { - XFS_BMAP_INIT(args->flist, args->firstblock); + xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, 1, args->firstblock, args->flist, diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 138308e70d14..c852cd65aaea 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -595,9 +595,9 @@ xfs_bmap_add_extent( xfs_iext_insert(ifp, 0, 1, new); ASSERT(cur == NULL); ifp->if_lastex = 0; - if (!ISNULLSTARTBLOCK(new->br_startblock)) { + if (!isnullstartblock(new->br_startblock)) { XFS_IFORK_NEXT_SET(ip, whichfork, 1); - logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else logflags = 0; /* DELTA: single new extent */ @@ -613,7 +613,7 @@ xfs_bmap_add_extent( /* * Any kind of new delayed allocation goes here. */ - else if (ISNULLSTARTBLOCK(new->br_startblock)) { + else if (isnullstartblock(new->br_startblock)) { if (cur) ASSERT((cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL) == 0); @@ -644,11 +644,11 @@ xfs_bmap_add_extent( * in a delayed or unwritten allocation with a real one, or * converting real back to unwritten. */ - if (!ISNULLSTARTBLOCK(new->br_startblock) && + if (!isnullstartblock(new->br_startblock) && new->br_startoff + new->br_blockcount > prev.br_startoff) { if (prev.br_state != XFS_EXT_UNWRITTEN && - ISNULLSTARTBLOCK(prev.br_startblock)) { - da_old = STARTBLOCKVAL(prev.br_startblock); + isnullstartblock(prev.br_startblock)) { + da_old = startblockval(prev.br_startblock); if (cur) ASSERT(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL); @@ -803,7 +803,7 @@ xfs_bmap_add_extent_delay_real( */ if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); + STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock)); } STATE_SET(LEFT_CONTIG, STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && @@ -820,7 +820,7 @@ xfs_bmap_add_extent_delay_real( idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); + STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock)); } STATE_SET(RIGHT_CONTIG, STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && @@ -1019,8 +1019,8 @@ xfs_bmap_add_extent_delay_real( goto done; } temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + startblockval(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: The boundary between two in-core extents moved. */ @@ -1067,10 +1067,10 @@ xfs_bmap_add_extent_delay_real( goto done; } temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock) - + startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, idx + 1); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK); *dnew = temp; /* DELTA: One in-core extent is split in two. */ @@ -1110,8 +1110,8 @@ xfs_bmap_add_extent_delay_real( goto done; } temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + startblockval(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: The boundary between two in-core extents moved. */ @@ -1157,10 +1157,10 @@ xfs_bmap_add_extent_delay_real( goto done; } temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock) - + startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); ep = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK); *dnew = temp; /* DELTA: One in-core extent is split in two. */ @@ -1213,7 +1213,7 @@ xfs_bmap_add_extent_delay_real( } temp = xfs_bmap_worst_indlen(ip, temp); temp2 = xfs_bmap_worst_indlen(ip, temp2); - diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) - + diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - (cur ? cur->bc_private.b.allocated : 0)); if (diff > 0 && xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) { @@ -1241,11 +1241,11 @@ xfs_bmap_add_extent_delay_real( } } ep = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK); XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), - NULLSTARTBLOCK((int)temp2)); + nullstartblock((int)temp2)); XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK); *dnew = temp + temp2; /* DELTA: One in-core extent is split in three. */ @@ -1365,7 +1365,7 @@ xfs_bmap_add_extent_unwritten_real( */ if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); + STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock)); } STATE_SET(LEFT_CONTIG, STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && @@ -1382,7 +1382,7 @@ xfs_bmap_add_extent_unwritten_real( idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); + STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock)); } STATE_SET(RIGHT_CONTIG, STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && @@ -1889,13 +1889,13 @@ xfs_bmap_add_extent_hole_delay( ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ep = xfs_iext_get_ext(ifp, idx); state = 0; - ASSERT(ISNULLSTARTBLOCK(new->br_startblock)); + ASSERT(isnullstartblock(new->br_startblock)); /* * Check and set flags if this segment has a left neighbor */ if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); + STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock)); } /* * Check and set flags if the current (right) segment exists. @@ -1905,7 +1905,7 @@ xfs_bmap_add_extent_hole_delay( idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { xfs_bmbt_get_all(ep, &right); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); + STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock)); } /* * Set contiguity flags on the left and right neighbors. @@ -1938,12 +1938,12 @@ xfs_bmap_add_extent_hole_delay( XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); - oldlen = STARTBLOCKVAL(left.br_startblock) + - STARTBLOCKVAL(new->br_startblock) + - STARTBLOCKVAL(right.br_startblock); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), - NULLSTARTBLOCK((int)newlen)); + nullstartblock((int)newlen)); XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1, XFS_DATA_FORK); XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK); @@ -1964,11 +1964,11 @@ xfs_bmap_add_extent_hole_delay( XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1, XFS_DATA_FORK); xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); - oldlen = STARTBLOCKVAL(left.br_startblock) + - STARTBLOCKVAL(new->br_startblock); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), - NULLSTARTBLOCK((int)newlen)); + nullstartblock((int)newlen)); XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, XFS_DATA_FORK); ip->i_df.if_lastex = idx - 1; @@ -1985,11 +1985,11 @@ xfs_bmap_add_extent_hole_delay( */ XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK); temp = new->br_blockcount + right.br_blockcount; - oldlen = STARTBLOCKVAL(new->br_startblock) + - STARTBLOCKVAL(right.br_startblock); + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); xfs_bmbt_set_allf(ep, new->br_startoff, - NULLSTARTBLOCK((int)newlen), temp, right.br_state); + nullstartblock((int)newlen), temp, right.br_state); XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK); ip->i_df.if_lastex = idx; /* DELTA: One in-core extent grew into a hole. */ @@ -2085,7 +2085,7 @@ xfs_bmap_add_extent_hole_real( */ if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); + STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock)); } /* * Check and set flags if this segment has a current value. @@ -2095,7 +2095,7 @@ xfs_bmap_add_extent_hole_real( idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { xfs_bmbt_get_all(ep, &right); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); + STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock)); } /* * We're inserting a real allocation between "left" and "right". @@ -2143,7 +2143,7 @@ xfs_bmap_add_extent_hole_real( XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) { - rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; if ((error = xfs_bmbt_lookup_eq(cur, @@ -2185,7 +2185,7 @@ xfs_bmap_add_extent_hole_real( XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork); ifp->if_lastex = idx - 1; if (cur == NULL) { - rval = XFS_ILOG_FEXT(whichfork); + rval = xfs_ilog_fext(whichfork); } else { rval = 0; if ((error = xfs_bmbt_lookup_eq(cur, @@ -2220,7 +2220,7 @@ xfs_bmap_add_extent_hole_real( XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork); ifp->if_lastex = idx; if (cur == NULL) { - rval = XFS_ILOG_FEXT(whichfork); + rval = xfs_ilog_fext(whichfork); } else { rval = 0; if ((error = xfs_bmbt_lookup_eq(cur, @@ -2254,7 +2254,7 @@ xfs_bmap_add_extent_hole_real( XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) { - rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; if ((error = xfs_bmbt_lookup_eq(cur, @@ -2482,7 +2482,7 @@ xfs_bmap_adjacent( * try to use it's last block as our starting point. */ if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && - !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && + !isnullstartblock(ap->prevp->br_startblock) && ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, ap->prevp->br_startblock)) { ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; @@ -2511,7 +2511,7 @@ xfs_bmap_adjacent( * start block based on it. */ if (ap->prevp->br_startoff != NULLFILEOFF && - !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && + !isnullstartblock(ap->prevp->br_startblock) && (prevbno = ap->prevp->br_startblock + ap->prevp->br_blockcount) && ISVALID(prevbno, ap->prevp->br_startblock)) { @@ -2552,7 +2552,7 @@ xfs_bmap_adjacent( * If there's a following (right) block, select a requested * start block based on it. */ - if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) { + if (!isnullstartblock(ap->gotp->br_startblock)) { /* * Calculate gap to start of next block. */ @@ -3082,7 +3082,7 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -3136,8 +3136,8 @@ xfs_bmap_del_extent( del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got.br_startoff + got.br_blockcount; ASSERT(got_endoff >= del_endoff); - delay = ISNULLSTARTBLOCK(got.br_startblock); - ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); flags = 0; qfield = 0; error = 0; @@ -3189,7 +3189,7 @@ xfs_bmap_del_extent( } da_old = da_new = 0; } else { - da_old = STARTBLOCKVAL(got.br_startblock); + da_old = startblockval(got.br_startblock); da_new = 0; nblks = 0; do_fx = 0; @@ -3213,7 +3213,7 @@ xfs_bmap_del_extent( XFS_IFORK_NEXTENTS(ip, whichfork) - 1); flags |= XFS_ILOG_CORE; if (!cur) { - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_btree_delete(cur, &i))) @@ -3233,7 +3233,7 @@ xfs_bmap_del_extent( if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); da_new = temp; @@ -3242,7 +3242,7 @@ xfs_bmap_del_extent( xfs_bmbt_set_startblock(ep, del_endblock); XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork); if (!cur) { - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, @@ -3262,7 +3262,7 @@ xfs_bmap_del_extent( if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); da_new = temp; @@ -3270,7 +3270,7 @@ xfs_bmap_del_extent( } XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork); if (!cur) { - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_bmbt_update(cur, got.br_startoff, @@ -3345,22 +3345,22 @@ xfs_bmap_del_extent( } XFS_WANT_CORRUPTED_GOTO(i == 1, done); } else - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); } else { ASSERT(whichfork == XFS_DATA_FORK); temp = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); temp2 = xfs_bmap_worst_indlen(ip, temp2); - new.br_startblock = NULLSTARTBLOCK((int)temp2); + new.br_startblock = nullstartblock((int)temp2); da_new = temp + temp2; while (da_new > da_old) { if (temp) { temp--; da_new--; xfs_bmbt_set_startblock(ep, - NULLSTARTBLOCK((int)temp)); + nullstartblock((int)temp)); } if (da_new == da_old) break; @@ -3368,7 +3368,7 @@ xfs_bmap_del_extent( temp2--; da_new--; new.br_startblock = - NULLSTARTBLOCK((int)temp2); + nullstartblock((int)temp2); } } } @@ -3545,7 +3545,7 @@ xfs_bmap_extents_to_btree( nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); for (cnt = i = 0; i < nextents; i++) { ep = xfs_iext_get_ext(ifp, i); - if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) { + if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { arp->l0 = cpu_to_be64(ep->l0); arp->l1 = cpu_to_be64(ep->l1); arp++; cnt++; @@ -3572,7 +3572,7 @@ xfs_bmap_extents_to_btree( xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); ASSERT(*curp == NULL); *curp = cur; - *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); return 0; } @@ -3676,7 +3676,7 @@ xfs_bmap_local_to_extents( ip->i_d.di_nblocks = 1; XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); } else { ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); @@ -4082,7 +4082,7 @@ xfs_bmap_add_attrfork( XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; - XFS_BMAP_INIT(&flist, &firstblock); + xfs_bmap_init(&flist, &firstblock); switch (ip->i_d.di_format) { case XFS_DINODE_FMT_LOCAL: error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, @@ -4162,7 +4162,7 @@ xfs_bmap_add_free( ASSERT(bno != NULLFSBLOCK); ASSERT(len > 0); ASSERT(len <= MAXEXTLEN); - ASSERT(!ISNULLSTARTBLOCK(bno)); + ASSERT(!isnullstartblock(bno)); agno = XFS_FSB_TO_AGNO(mp, bno); agbno = XFS_FSB_TO_AGBNO(mp, bno); ASSERT(agno < mp->m_sb.sb_agcount); @@ -4909,7 +4909,7 @@ xfs_bmapi( got.br_startoff = end; inhole = eof || got.br_startoff > bno; wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && - ISNULLSTARTBLOCK(got.br_startblock); + isnullstartblock(got.br_startblock); /* * First, deal with the hole before the allocated space * that we found, if any. @@ -5028,7 +5028,7 @@ xfs_bmapi( } ip->i_delayed_blks += alen; - abno = NULLSTARTBLOCK(indlen); + abno = nullstartblock(indlen); } else { /* * If first time, allocate and fill in @@ -5144,8 +5144,8 @@ xfs_bmapi( aoff + alen); #ifdef DEBUG if (flags & XFS_BMAPI_DELAY) { - ASSERT(ISNULLSTARTBLOCK(got.br_startblock)); - ASSERT(STARTBLOCKVAL(got.br_startblock) > 0); + ASSERT(isnullstartblock(got.br_startblock)); + ASSERT(startblockval(got.br_startblock) > 0); } ASSERT(got.br_state == XFS_EXT_NORM || got.br_state == XFS_EXT_UNWRITTEN); @@ -5179,7 +5179,7 @@ xfs_bmapi( ASSERT((bno >= obno) || (n == 0)); ASSERT(bno < end); mval->br_startoff = bno; - if (ISNULLSTARTBLOCK(got.br_startblock)) { + if (isnullstartblock(got.br_startblock)) { ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); mval->br_startblock = DELAYSTARTBLOCK; } else @@ -5201,7 +5201,7 @@ xfs_bmapi( ASSERT(mval->br_blockcount <= len); } else { *mval = got; - if (ISNULLSTARTBLOCK(mval->br_startblock)) { + if (isnullstartblock(mval->br_startblock)) { ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); mval->br_startblock = DELAYSTARTBLOCK; } @@ -5329,12 +5329,12 @@ error0: * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. */ - if ((logflags & XFS_ILOG_FEXT(whichfork)) && + if ((logflags & xfs_ilog_fext(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~XFS_ILOG_FEXT(whichfork); - else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~XFS_ILOG_FBROOT(whichfork); + logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log whatever the flags say, even if error. Otherwise we might miss * detecting a case where the data is changed, there's an error, @@ -5411,7 +5411,7 @@ xfs_bmapi_single( *fsb = NULLFSBLOCK; return 0; } - ASSERT(!ISNULLSTARTBLOCK(got.br_startblock)); + ASSERT(!isnullstartblock(got.br_startblock)); ASSERT(bno < got.br_startoff + got.br_blockcount); *fsb = got.br_startblock + (bno - got.br_startoff); ifp->if_lastex = lastx; @@ -5543,7 +5543,7 @@ xfs_bunmapi( */ ASSERT(ep != NULL); del = got; - wasdel = ISNULLSTARTBLOCK(del.br_startblock); + wasdel = isnullstartblock(del.br_startblock); if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; @@ -5638,7 +5638,7 @@ xfs_bunmapi( xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), &prev); ASSERT(prev.br_state == XFS_EXT_NORM); - ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock)); + ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == prev.br_startblock + prev.br_blockcount); if (prev.br_startoff < start) { @@ -5666,7 +5666,7 @@ xfs_bunmapi( } } if (wasdel) { - ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); + ASSERT(startblockval(del.br_startblock) > 0); /* Update realtime/data freespace, unreserve quota */ if (isrt) { xfs_filblks_t rtexts; @@ -5782,12 +5782,12 @@ error0: * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. */ - if ((logflags & XFS_ILOG_FEXT(whichfork)) && + if ((logflags & xfs_ilog_fext(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~XFS_ILOG_FEXT(whichfork); - else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~XFS_ILOG_FBROOT(whichfork); + logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log inode even in the error case, if the transaction * is dirty we'll need to shut down the filesystem. @@ -5838,7 +5838,7 @@ xfs_getbmapx_fix_eof_hole( if (startblock == DELAYSTARTBLOCK) out->bmv_block = -2; else - out->bmv_block = XFS_FSB_TO_DB(ip, startblock); + out->bmv_block = xfs_fsb_to_db(ip, startblock); fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && @@ -5979,7 +5979,7 @@ xfs_getbmap( if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; - bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | + bmapi_flags = xfs_bmapi_aflag(whichfork) | ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE); /* @@ -6098,7 +6098,7 @@ xfs_bmap_isaeof( */ *aeof = (off >= s.br_startoff && off < s.br_startoff + s.br_blockcount && - ISNULLSTARTBLOCK(s.br_startblock)) || + isnullstartblock(s.br_startblock)) || off >= s.br_startoff + s.br_blockcount; return 0; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 284571c05ed0..be2979d88d32 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -95,7 +95,6 @@ typedef struct xfs_bmap_free /* need write cache flushing and no */ /* additional allocation alignments */ -#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w) static inline int xfs_bmapi_aflag(int w) { return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); @@ -107,7 +106,6 @@ static inline int xfs_bmapi_aflag(int w) #define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) #define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) -#define XFS_BMAP_INIT(flp,fbp) xfs_bmap_init(flp,fbp) static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) { ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index ba6b08c2fb02..0760d352586f 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -121,7 +121,7 @@ __xfs_bmbt_get_all( b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) | (((xfs_dfsbno_t)l1) >> 21); - ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); + ASSERT((b >> 32) == 0 || isnulldstartblock(b)); s->br_startblock = (xfs_fsblock_t)b; } #else /* !DEBUG */ @@ -172,7 +172,7 @@ xfs_bmbt_get_startblock( b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) | (((xfs_dfsbno_t)r->l1) >> 21); - ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); + ASSERT((b >> 32) == 0 || isnulldstartblock(b)); return (xfs_fsblock_t)b; #else /* !DEBUG */ return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21); @@ -261,7 +261,7 @@ xfs_bmbt_set_allf( ((xfs_bmbt_rec_base_t)blockcount & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(startblock)) { + if (isnullstartblock(startblock)) { r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | ((xfs_bmbt_rec_base_t)startoff << 9) | (xfs_bmbt_rec_base_t)xfs_mask64lo(9); @@ -321,7 +321,7 @@ xfs_bmbt_disk_set_allf( ((xfs_bmbt_rec_base_t)blockcount & (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(startblock)) { + if (isnullstartblock(startblock)) { r->l0 = cpu_to_be64( ((xfs_bmbt_rec_base_t)extent_flag << 63) | ((xfs_bmbt_rec_base_t)startoff << 9) | @@ -382,7 +382,7 @@ xfs_bmbt_set_startblock( r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | (xfs_bmbt_rec_base_t)(v << 21); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(v)) { + if (isnullstartblock(v)) { r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9); r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) | ((xfs_bmbt_rec_base_t)v << 21) | diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index a4555abb6622..0e8df007615e 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -76,26 +76,22 @@ typedef struct xfs_bmbt_rec_host { #define DSTARTBLOCKMASK \ (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) -#define ISNULLSTARTBLOCK(x) isnullstartblock(x) static inline int isnullstartblock(xfs_fsblock_t x) { return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; } -#define ISNULLDSTARTBLOCK(x) isnulldstartblock(x) static inline int isnulldstartblock(xfs_dfsbno_t x) { return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; } -#define NULLSTARTBLOCK(k) nullstartblock(k) static inline xfs_fsblock_t nullstartblock(int k) { ASSERT(k < (1 << STARTBLOCKVALBITS)); return STARTBLOCKMASK | (k); } -#define STARTBLOCKVAL(x) startblockval(x) static inline xfs_filblks_t startblockval(xfs_fsblock_t x) { return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 2c3ef20f8842..4681519ded91 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -960,7 +960,7 @@ xfs_btree_buf_to_ptr( ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp))); else { - ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp, + ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp))); } } @@ -2454,7 +2454,7 @@ xfs_btree_new_iroot( xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); *logflags |= - XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); *stat = 1; XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; @@ -3048,7 +3048,7 @@ xfs_btree_kill_iroot( cur->bc_bufs[level - 1] = NULL; be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, - XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); cur->bc_nlevels--; out0: XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index a11a8390bf6c..c45f74ff1a5b 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1597,7 +1597,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) nmap = 1; ASSERT(args->firstblock != NULL); if ((error = xfs_bmapi(tp, dp, bno, count, - XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| + xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, args->flist, NULL))) { @@ -1618,7 +1618,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) nmap = MIN(XFS_BMAP_MAX_NMAP, count); c = (int)(bno + count - b); if ((error = xfs_bmapi(tp, dp, b, c, - XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE| + xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE| XFS_BMAPI_METADATA, args->firstblock, args->total, &mapp[mapi], &nmap, args->flist, @@ -1882,7 +1882,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, * the last block to the place we want to kill. */ if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, - XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, NULL, &done)) == ENOSPC) { if (w != XFS_DATA_FORK) @@ -1987,7 +1987,7 @@ xfs_da_do_buf( if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, nfsb, XFS_BMAPI_METADATA | - XFS_BMAPI_AFLAG(whichfork), + xfs_bmapi_aflag(whichfork), NULL, 0, mapp, &nmap, NULL, NULL))) goto exit0; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index e6ebbaeb4dc6..ab016e5ae7be 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -357,7 +357,7 @@ xfs_ialloc_ag_alloc( int ioffset = i << args.mp->m_sb.sb_inodelog; uint isize = sizeof(struct xfs_dinode); - free = XFS_MAKE_IPTR(args.mp, fbuf, i); + free = xfs_make_iptr(args.mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_version = version; free->di_gen = cpu_to_be32(gen); @@ -937,7 +937,7 @@ nextag: } } } - offset = XFS_IALLOC_FIND_FREE(&rec.ir_free); + offset = xfs_ialloc_find_free(&rec.ir_free); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1279,7 +1279,7 @@ xfs_imap( offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno); + cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno); offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock; imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 50f558a4e0a8..aeee8278f92c 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -39,7 +39,6 @@ struct xfs_trans; /* * Make an inode pointer out of the buffer/offset. */ -#define XFS_MAKE_IPTR(mp,b,o) xfs_make_iptr(mp,b,o) static inline struct xfs_dinode * xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) { @@ -50,7 +49,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) /* * Find a free (set) bit in the inode bitmask. */ -#define XFS_IALLOC_FIND_FREE(fp) xfs_ialloc_find_free(fp) static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) { return xfs_lowbit64(*fp); diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 37e5dd01a577..5580e255ff06 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -36,7 +36,6 @@ typedef __uint64_t xfs_inofree_t; #define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) -#define XFS_INOBT_MASKN(i,n) xfs_inobt_maskn(i,n) static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { return (((n) >= XFS_INODES_PER_CHUNK ? \ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5a5e035e5d38..be2ca4d67b53 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1601,10 +1601,10 @@ xfs_itruncate_finish( * in this file with garbage in them once recovery * runs. */ - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); error = xfs_bunmapi(ntp, ip, first_unmap_block, unmap_len, - XFS_BMAPI_AFLAG(fork) | + xfs_bmapi_aflag(fork) | (sync ? 0 : XFS_BMAPI_ASYNC), XFS_ITRUNC_MAX_EXTENTS, &first_block, &free_list, @@ -2557,7 +2557,7 @@ xfs_iextents_copy( for (i = 0; i < nrecs; i++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); start_block = xfs_bmbt_get_startblock(ep); - if (ISNULLSTARTBLOCK(start_block)) { + if (isnullstartblock(start_block)) { /* * It's a delayed allocation extent, so skip it. */ diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 1ff04cc323ad..9957d0602d54 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -111,20 +111,16 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) - -#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w) static inline int xfs_ilog_fbroot(int w) { return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); } -#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w) static inline int xfs_ilog_fext(int w) { return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); } -#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w) static inline int xfs_ilog_fdata(int w) { return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 911062cf73a6..08ce72316bfe 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -155,7 +155,7 @@ xfs_imap_to_bmap( iomapp->iomap_bn = IOMAP_DADDR_NULL; iomapp->iomap_flags |= IOMAP_DELAY; } else { - iomapp->iomap_bn = XFS_FSB_TO_DB(ip, start_block); + iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block); if (ISUNWRITTEN(imap)) iomapp->iomap_flags |= IOMAP_UNWRITTEN; } @@ -261,7 +261,7 @@ xfs_iomap( xfs_iunlock(ip, lockmode); lockmode = 0; - if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) { + if (nimaps && !isnullstartblock(imap.br_startblock)) { xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip, offset, count, iomapp, &imap, flags); break; @@ -491,7 +491,7 @@ xfs_iomap_write_direct( /* * Issue the xfs_bmapi() call to allocate the blocks */ - XFS_BMAP_INIT(&free_list, &firstfsb); + xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, &firstfsb, 0, &imap, &nimaps, &free_list, NULL); @@ -751,7 +751,7 @@ xfs_iomap_write_allocate( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); /* * it is possible that the extents have changed since @@ -911,7 +911,7 @@ xfs_iomap_write_unwritten( /* * Modify the unwritten extent state of the buffer. */ - XFS_BMAP_INIT(&free_list, &firstfsb); + xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index e19d0a8d5618..cf98a805ec90 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -453,7 +453,7 @@ xfs_bulkstat( (chunkidx = agino - gino + 1) < XFS_INODES_PER_CHUNK && /* there are some left allocated */ - XFS_INOBT_MASKN(chunkidx, + xfs_inobt_maskn(chunkidx, XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { /* * Grab the chunk record. Mark all the @@ -464,7 +464,7 @@ xfs_bulkstat( if (XFS_INOBT_MASK(i) & ~gfree) gcnt++; } - gfree |= XFS_INOBT_MASKN(0, chunkidx); + gfree |= xfs_inobt_maskn(0, chunkidx); irbp->ir_startino = gino; irbp->ir_freecount = gcnt; irbp->ir_free = gfree; @@ -535,7 +535,7 @@ xfs_bulkstat( chunkidx < XFS_INODES_PER_CHUNK; chunkidx += nicluster, agbno += nbcluster) { - if (XFS_INOBT_MASKN(chunkidx, + if (xfs_inobt_maskn(chunkidx, nicluster) & ~gfree) xfs_btree_reada_bufs(mp, agno, agbno, nbcluster); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index c1e028467327..9bb41a9f765d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -44,9 +44,9 @@ typedef struct xfs_trans_reservations { #ifndef __KERNEL__ -#define XFS_DADDR_TO_AGNO(mp,d) \ +#define xfs_daddr_to_agno(mp,d) \ ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) -#define XFS_DADDR_TO_AGBNO(mp,d) \ +#define xfs_daddr_to_agbno(mp,d) \ ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) #else /* __KERNEL__ */ @@ -439,7 +439,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, */ #define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ -#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) static inline xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) { @@ -448,7 +447,6 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agnumber_t) ld; } -#define XFS_DADDR_TO_AGBNO(mp,d) xfs_daddr_to_agbno(mp,d) static inline xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) { diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 86471bb40fd4..58f85e9cd11d 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -147,7 +147,7 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, inodes, &num_inodes); - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index edf12c7b834c..c5bb86f3ec05 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -120,7 +120,7 @@ xfs_growfs_rt_alloc( if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip))) goto error_cancel; - XFS_BMAP_INIT(&flist, &firstblock); + xfs_bmap_init(&flist, &firstblock); /* * Allocate blocks to the bitmap file. */ diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index f87db5344ce6..f76c003ec55d 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -28,7 +28,6 @@ struct xfs_mount; * file is a real time file or not, because the bmap code * does. */ -#define XFS_FSB_TO_DB(ip,fsb) xfs_fsb_to_db(ip,fsb) static inline xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) { diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 1ed71916e4c9..1b017c657494 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -505,7 +505,7 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ - XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d)) + xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) #define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index f07bf8768c3a..0e55c5d7db5f 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -862,7 +862,7 @@ xfs_inactive_symlink_rmt( * Find the block(s) so we can inval and unmap them. */ done = 0; - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); nmaps = ARRAY_SIZE(mval); if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, @@ -1288,7 +1288,7 @@ xfs_inactive( /* * Free the inode. */ - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); error = xfs_ifree(tp, ip, &free_list); if (error) { /* @@ -1461,7 +1461,7 @@ xfs_create( xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = B_TRUE; - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); ASSERT(ip == NULL); @@ -1879,7 +1879,7 @@ xfs_remove( } } - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); error = xfs_dir_removename(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks); if (error) { @@ -2059,7 +2059,7 @@ xfs_link( if (error) goto error_return; - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, &first_block, &free_list, resblks); @@ -2231,7 +2231,7 @@ xfs_mkdir( xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = B_FALSE; - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino, &first_block, &free_list, resblks ? @@ -2438,7 +2438,7 @@ xfs_symlink( * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. */ - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); /* * Allocate an inode for the symlink. @@ -2860,7 +2860,7 @@ retry: /* * Issue the xfs_bmapi() call to allocate the blocks */ - XFS_BMAP_INIT(&free_list, &firstfsb); + xfs_bmap_init(&free_list, &firstfsb); error = xfs_bmapi(tp, ip, startoffset_fsb, allocatesize_fsb, bmapi_flag, &firstfsb, 0, imapp, &nimaps, @@ -2980,7 +2980,7 @@ xfs_zero_remaining_bytes( XFS_BUF_UNDONE(bp); XFS_BUF_UNWRITE(bp); XFS_BUF_READ(bp); - XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); + XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); xfsbdstrat(mp, bp); error = xfs_iowait(bp); if (error) { @@ -3186,7 +3186,7 @@ xfs_free_file_space( /* * issue the bunmapi() call to free the blocks */ - XFS_BMAP_INIT(&free_list, &firstfsb); + xfs_bmap_init(&free_list, &firstfsb); error = xfs_bunmapi(tp, ip, startoffset_fsb, endoffset_fsb - startoffset_fsb, 0, 2, &firstfsb, &free_list, NULL, &done); -- cgit v1.2.3-59-g8ed1b From 6c1a99afbda99cd8d8c69d756387041567a13d87 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 15 Jan 2009 18:05:40 +0800 Subject: ftrace: fix trace_output Impact: fix bug for handling partial line trace_seq_printf(), seq_print_userip_objs(), ... return 0 -- partial line was written other(>0) -- success duplicate output is also removed in trace_print_raw(). Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 65 +++++++++++++++++++++------------------------ kernel/trace/trace_output.h | 4 +-- 2 files changed, 33 insertions(+), 36 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index df0c25cbed30..4e3ad36b117c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -440,9 +440,9 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags) trace_assign_type(field, entry); - if (trace_seq_printf(s, "%x %x\n", - field->ip, - field->parent_ip)) + if (!trace_seq_printf(s, "%x %x\n", + field->ip, + field->parent_ip)) return TRACE_TYPE_PARTIAL_LINE; return 0; @@ -497,14 +497,14 @@ trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags, T = task_state_char(field->next_state); S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); - if (trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", - field->prev_pid, - field->prev_prio, - S, delim, - field->next_cpu, - field->next_pid, - field->next_prio, - T, comm)) + if (!trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm)) return TRACE_TYPE_PARTIAL_LINE; return 0; @@ -534,14 +534,14 @@ trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags, if (!S) task_state_char(field->prev_state); T = task_state_char(field->next_state); - if (trace_seq_printf(s, "%d %d %c %d %d %d %c\n", - field->prev_pid, - field->prev_prio, - S, - field->next_cpu, - field->next_pid, - field->next_prio, - T)) + if (!trace_seq_printf(s, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T)) return TRACE_TYPE_PARTIAL_LINE; return 0; @@ -639,10 +639,10 @@ trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags) trace_assign_type(field, entry); - if (trace_seq_printf(s, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3)) + if (!trace_seq_printf(s, "# %ld %ld %ld\n", + field->arg1, + field->arg2, + field->arg3)) return TRACE_TYPE_PARTIAL_LINE; return 0; @@ -697,13 +697,13 @@ trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags) for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) { - if (trace_seq_puts(s, " <= ")) + if (!trace_seq_puts(s, " <= ")) goto partial; - if (seq_print_ip_sym(s, field->caller[i], flags)) + if (!seq_print_ip_sym(s, field->caller[i], flags)) goto partial; } - if (trace_seq_puts(s, "\n")) + if (!trace_seq_puts(s, "\n")) goto partial; } @@ -731,10 +731,10 @@ trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry, trace_assign_type(field, entry); - if (seq_print_userip_objs(field, s, flags)) + if (!seq_print_userip_objs(field, s, flags)) goto partial; - if (trace_seq_putc(s, '\n')) + if (!trace_seq_putc(s, '\n')) goto partial; return 0; @@ -760,10 +760,10 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) trace_assign_type(field, entry); - if (seq_print_ip_sym(s, field->ip, flags)) + if (!seq_print_ip_sym(s, field->ip, flags)) goto partial; - if (trace_seq_printf(s, ": %s", field->buf)) + if (!trace_seq_printf(s, ": %s", field->buf)) goto partial; return 0; @@ -779,10 +779,7 @@ trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags) trace_assign_type(field, entry); - if (seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (trace_seq_printf(s, "# %lx %s", field->ip, field->buf)) + if (!trace_seq_printf(s, "# %lx %s", field->ip, field->buf)) goto partial; return 0; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index ecab4ea4a4fd..b2c14615e0cd 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -45,14 +45,14 @@ trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags); #define SEQ_PUT_FIELD_RET(s, x) \ do { \ if (!trace_seq_putmem(s, &(x), sizeof(x))) \ - return 0; \ + return TRACE_TYPE_PARTIAL_LINE; \ } while (0) #define SEQ_PUT_HEX_FIELD_RET(s, x) \ do { \ BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ - return 0; \ + return TRACE_TYPE_PARTIAL_LINE; \ } while (0) #endif -- cgit v1.2.3-59-g8ed1b From 5361499101306cfb776c3cfa0f69d0479bc63868 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 19:12:40 -0500 Subject: ftrace: add stack trace to function tracer Impact: new feature to stack trace any function Chris Mason asked about being able to pick and choose a function and get a stack trace from it. This feature enables his request. # echo io_schedule > /debug/tracing/set_ftrace_filter # echo function > /debug/tracing/current_tracer # echo func_stack_trace > /debug/tracing/trace_options Produces the following in /debug/tracing/trace: kjournald-702 [001] 135.673060: io_schedule <-sync_buffer kjournald-702 [002] 135.673671: <= sync_buffer <= __wait_on_bit <= out_of_line_wait_on_bit <= __wait_on_buffer <= sync_dirty_buffer <= journal_commit_transaction <= kjournald Note, be careful about turning this on without filtering the functions. You may find that you have a 10 second lag between typing and seeing what you typed. This is why the stack trace for the function tracer does not use the same stack_trace flag as the other tracers use. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 26 ++++++++----- kernel/trace/trace.h | 7 ++++ kernel/trace/trace_functions.c | 84 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dcb757f70d21..3c54cb125228 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -835,10 +835,10 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags, pc); } -static void ftrace_trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long flags, - int skip, int pc) +static void __ftrace_trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc) { #ifdef CONFIG_STACKTRACE struct ring_buffer_event *event; @@ -846,9 +846,6 @@ static void ftrace_trace_stack(struct trace_array *tr, struct stack_trace trace; unsigned long irq_flags; - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags); if (!event) @@ -869,12 +866,23 @@ static void ftrace_trace_stack(struct trace_array *tr, #endif } +static void ftrace_trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc) +{ + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(tr, data, flags, skip, pc); +} + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, - int skip) + int skip, int pc) { - ftrace_trace_stack(tr, data, flags, skip, preempt_count()); + __ftrace_trace_stack(tr, data, flags, skip, pc); } static void ftrace_trace_userstack(struct trace_array *tr, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79c872100dd5..bf39a369e4b3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -457,6 +457,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc); + extern cycle_t ftrace_now(int cpu); #ifdef CONFIG_FUNCTION_TRACER @@ -467,6 +472,8 @@ void tracing_stop_function_trace(void); # define tracing_stop_function_trace() do { } while (0) #endif +extern int ftrace_function_enabled; + #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void (*tracer_switch_func_t)(void *private, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 9236d7e25a16..3a5fa08cedb0 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,6 +16,8 @@ #include "trace.h" +static struct trace_array *func_trace; + static void start_function_trace(struct trace_array *tr) { tr->cpu = get_cpu(); @@ -34,6 +36,7 @@ static void stop_function_trace(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { + func_trace = tr; start_function_trace(tr); return 0; } @@ -48,12 +51,93 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); } +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + /* + * skip over 5 funcs: + * __ftrace_trace_stack, + * __trace_stack, + * function_stack_trace_call + * ftrace_list_func + * ftrace_call + */ + __trace_stack(tr, data, flags, 5, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static struct ftrace_ops trace_stack_ops __read_mostly = +{ + .func = function_stack_trace_call, +}; + +/* Our two options */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, +}; + +static struct tracer_opt func_opts[] = { +#ifdef CONFIG_STACKTRACE + { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, +#endif + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags func_flags = { + .val = 0, /* By default: all flags disabled */ + .opts = func_opts +}; + +static int func_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_FUNC_OPT_STACK) { + /* do nothing if already set */ + if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) + return 0; + + if (set) + register_ftrace_function(&trace_stack_ops); + else + unregister_ftrace_function(&trace_stack_ops); + + return 0; + } + + return -EINVAL; +} + static struct tracer function_trace __read_mostly = { .name = "function", .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, + .flags = &func_flags, + .set_flag = func_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif -- cgit v1.2.3-59-g8ed1b From bb3c3c95f330f7bf16e33b002e48882616089db1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 20:40:23 -0500 Subject: ftrace: move function tracer functions out of trace.c Impact: clean up of trace.c The function tracer functions were put in trace.c because it needed to share static variables that were in trace.c. Since then, those variables have become global for various reasons. This patch moves the function tracer functions into trace_function.c where they belong. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 84 ------------------------------------------ kernel/trace/trace_functions.c | 84 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 85 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3c54cb125228..2585ffb6c6b5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1046,65 +1046,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_TRACER -static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu, resched; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - pc = preempt_count(); - resched = ftrace_preempt_disable(); - local_save_flags(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags, pc); - - atomic_dec(&data->disabled); - ftrace_preempt_enable(resched); -} - -static void -function_trace_call(unsigned long ip, unsigned long parent_ip) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - long disabled; - int cpu; - int pc; - - if (unlikely(!ftrace_function_enabled)) - return; - - /* - * Need to use raw, since this must be called before the - * recursive protection is performed. - */ - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - trace_function(tr, data, ip, parent_ip, flags, pc); - } - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - #ifdef CONFIG_FUNCTION_GRAPH_TRACER int trace_graph_entry(struct ftrace_graph_ent *trace) { @@ -1162,31 +1103,6 @@ void trace_graph_return(struct ftrace_graph_ret *trace) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -static struct ftrace_ops trace_ops __read_mostly = -{ - .func = function_trace_call, -}; - -void tracing_start_function_trace(void) -{ - ftrace_function_enabled = 0; - - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - - register_ftrace_function(&trace_ops); - ftrace_function_enabled = 1; -} - -void tracing_stop_function_trace(void) -{ - ftrace_function_enabled = 0; - unregister_ftrace_function(&trace_ops); -} -#endif - enum trace_file_type { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 3a5fa08cedb0..2dce3c7370d1 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -20,6 +20,7 @@ static struct trace_array *func_trace; static void start_function_trace(struct trace_array *tr) { + func_trace = tr; tr->cpu = get_cpu(); tracing_reset_online_cpus(tr); put_cpu(); @@ -36,7 +37,6 @@ static void stop_function_trace(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { - func_trace = tr; start_function_trace(tr); return 0; } @@ -51,6 +51,64 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(tr); } +static void +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu, resched; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + local_save_flags(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + trace_function(tr, data, ip, parent_ip, flags, pc); + + atomic_dec(&data->disabled); + ftrace_preempt_enable(resched); +} + +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip) { @@ -90,6 +148,30 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, +}; + +void tracing_start_function_trace(void) +{ + ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + + register_ftrace_function(&trace_ops); + ftrace_function_enabled = 1; +} + +void tracing_stop_function_trace(void) +{ + ftrace_function_enabled = 0; + unregister_ftrace_function(&trace_ops); +} static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, -- cgit v1.2.3-59-g8ed1b From c37abc5515b5ed5b1d2134d2deaead492d9f92a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 20:50:54 -0500 Subject: trace: add gcc printf check to trace_seq_printf Andrew Morton suggested adding a printf checker to trace_seq_printf since there are a number of users that have improper format arguments. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index b2c14615e0cd..1cbab5e3dc99 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,7 +16,8 @@ struct trace_event { trace_print_func binary; }; -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -- cgit v1.2.3-59-g8ed1b From 5e4abc9839191e213965e0f1dbf36e2e44356c3a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 21:00:50 -0500 Subject: trace: clean up format errors in calls to trace_seq_printf After adding the printf format checking for trace_seq_printf, several warnings now show up. This patch cleans them up. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 4 ++-- kernel/trace/trace_mmiotrace.c | 13 +++++++------ kernel/trace/trace_output.c | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index faaa5ae7e75a..7ebc58cee3bd 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; /* Requested */ - ret = trace_seq_printf(s, "%4d ", entry->bytes_req); + ret = trace_seq_printf(s, "%4ld ", entry->bytes_req); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Allocated */ - ret = trace_seq_printf(s, "%4d ", entry->bytes_alloc); + ret = trace_seq_printf(s, "%4ld ", entry->bytes_alloc); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 621c8c3f3139..ec78e244242e 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -184,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) switch (rw->opcode) { case MMIO_READ: ret = trace_seq_printf(s, - "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: ret = trace_seq_printf(s, - "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: ret = trace_seq_printf(s, - "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," + "%02lx 0x%lx %d\n", secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, @@ -230,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) switch (m->opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, - "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", secs, usec_rem, m->map_id, (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: ret = trace_seq_printf(s, - "UNMAP %lu.%06lu %d 0x%lx %d\n", + "UNMAP %u.%06lu %d 0x%lx %d\n", secs, usec_rem, m->map_id, 0UL, 0); break; default: @@ -261,7 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); + ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4e3ad36b117c..1a4e144a9f8f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -440,7 +440,7 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags) trace_assign_type(field, entry); - if (!trace_seq_printf(s, "%x %x\n", + if (!trace_seq_printf(s, "%lx %lx\n", field->ip, field->parent_ip)) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From 3eb36aa05329a47cbe201c151fd0024a4a3649cd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 22:21:43 -0500 Subject: ftrace: combine stack trace in function call Impact: less likely to interleave function and stack traces This patch does replaces the separate stack trace on function with a record function and stack trace together. This will switch between the function only recording to a function and stack recording. Also some whitespace fix ups as well. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions.c | 61 +++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 2dce3c7370d1..61d0b73dabf5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -133,6 +133,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) if (likely(disabled == 1)) { pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); /* * skip over 5 funcs: * __ftrace_trace_stack, @@ -154,24 +155,6 @@ static struct ftrace_ops trace_ops __read_mostly = .func = function_trace_call, }; -void tracing_start_function_trace(void) -{ - ftrace_function_enabled = 0; - - if (trace_flags & TRACE_ITER_PREEMPTONLY) - trace_ops.func = function_trace_call_preempt_only; - else - trace_ops.func = function_trace_call; - - register_ftrace_function(&trace_ops); - ftrace_function_enabled = 1; -} - -void tracing_stop_function_trace(void) -{ - ftrace_function_enabled = 0; - unregister_ftrace_function(&trace_ops); -} static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, @@ -194,6 +177,31 @@ static struct tracer_flags func_flags = { .opts = func_opts }; +void tracing_start_function_trace(void) +{ + ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + + if (func_flags.val & TRACE_FUNC_OPT_STACK) + register_ftrace_function(&trace_stack_ops); + else + register_ftrace_function(&trace_ops); + + ftrace_function_enabled = 1; +} + +void tracing_stop_function_trace(void) +{ + ftrace_function_enabled = 0; + /* OK if they are not registered */ + unregister_ftrace_function(&trace_stack_ops); + unregister_ftrace_function(&trace_ops); +} + static int func_set_flag(u32 old_flags, u32 bit, int set) { if (bit == TRACE_FUNC_OPT_STACK) { @@ -201,10 +209,13 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) return 0; - if (set) + if (set) { + unregister_ftrace_function(&trace_ops); register_ftrace_function(&trace_stack_ops); - else + } else { unregister_ftrace_function(&trace_stack_ops); + register_ftrace_function(&trace_ops); + } return 0; } @@ -214,14 +225,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) static struct tracer function_trace __read_mostly = { - .name = "function", - .init = function_trace_init, - .reset = function_trace_reset, - .start = function_trace_start, + .name = "function", + .init = function_trace_init, + .reset = function_trace_reset, + .start = function_trace_start, .flags = &func_flags, .set_flag = func_set_flag, #ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_function, + .selftest = trace_selftest_startup_function, #endif }; -- cgit v1.2.3-59-g8ed1b From a225cdd263f340c864febb1992802fb5b08bc328 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 23:06:03 -0500 Subject: ftrace: remove static from function tracer functions Impact: clean up After reorganizing the functions in trace.c and trace_function.c, they no longer need to be in global context. This patch makes the functions and one variable into static. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 --- kernel/trace/trace.h | 10 ---------- kernel/trace/trace_functions.c | 10 ++++++++-- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2585ffb6c6b5..7de6a94063dd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -187,9 +187,6 @@ int tracing_is_enabled(void) return tracer_enabled; } -/* function tracing enabled */ -int ftrace_function_enabled; - /* * trace_buf_size is the size in bytes that is allocated * for a buffer. Note, the number of bytes is always rounded diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bf39a369e4b3..54b72781e920 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -464,16 +464,6 @@ void __trace_stack(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_FUNCTION_TRACER -void tracing_start_function_trace(void); -void tracing_stop_function_trace(void); -#else -# define tracing_start_function_trace() do { } while (0) -# define tracing_stop_function_trace() do { } while (0) -#endif - -extern int ftrace_function_enabled; - #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void (*tracer_switch_func_t)(void *private, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 61d0b73dabf5..b3a320f8aba7 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -16,8 +16,14 @@ #include "trace.h" +/* function tracing enabled */ +static int ftrace_function_enabled; + static struct trace_array *func_trace; +static void tracing_start_function_trace(void); +static void tracing_stop_function_trace(void); + static void start_function_trace(struct trace_array *tr) { func_trace = tr; @@ -177,7 +183,7 @@ static struct tracer_flags func_flags = { .opts = func_opts }; -void tracing_start_function_trace(void) +static void tracing_start_function_trace(void) { ftrace_function_enabled = 0; @@ -194,7 +200,7 @@ void tracing_start_function_trace(void) ftrace_function_enabled = 1; } -void tracing_stop_function_trace(void) +static void tracing_stop_function_trace(void) { ftrace_function_enabled = 0; /* OK if they are not registered */ -- cgit v1.2.3-59-g8ed1b From 745b1626dd71ce9661a05ea4db57859ed5c773d2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2009 23:40:11 -0500 Subject: trace: set max latency variable to zero on default Impact: trace max latencies on start of latency tracing This patch sets the max latency to zero whenever one of the irq variant tracers or the wakeup tracer is set to current tracer. Most developers expect to see output when starting up a latency tracer. But since the max_latency is already set to max, and it takes a latency greater than max_latency to be recorded, there is no trace. This is not the expected behavior and has even confused myself. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- kernel/trace/trace_irqsoff.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7de6a94063dd..220c264e3111 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,7 +41,7 @@ #define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) -unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; +unsigned long __read_mostly tracing_max_latency; unsigned long __read_mostly tracing_thresh; /* diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7c2e326bbc8b..62a78d943534 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) static void __irqsoff_tracer_init(struct trace_array *tr) { + tracing_max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 43586b689e31..42ae1e77b6b3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -333,6 +333,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int wakeup_tracer_init(struct trace_array *tr) { + tracing_max_latency = 0; wakeup_trace = tr; start_wakeup_tracer(tr); return 0; -- cgit v1.2.3-59-g8ed1b From e162b39a368f0401e41b558f430c354d12a85b37 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 15 Jan 2009 11:08:40 -0800 Subject: softlockup: decouple hung tasks check from softlockup detection Decoupling allows: * hung tasks check to happen at very low priority * hung tasks check and softlockup to be enabled/disabled independently at compile and/or run-time * individual panic settings to be enabled disabled independently at compile and/or run-time * softlockup threshold to be reduced without increasing hung tasks poll frequency (hung task check is expensive relative to softlock watchdog) * hung task check to be zero over-head when disabled at run-time Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 14 +++- kernel/Makefile | 1 + kernel/hung_task.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/softlockup.c | 100 ------------------------- kernel/sysctl.c | 15 +++- lib/Kconfig.debug | 38 ++++++++++ 6 files changed, 261 insertions(+), 105 deletions(-) create mode 100644 kernel/hung_task.c diff --git a/include/linux/sched.h b/include/linux/sched.h index 54cbabf3b871..f2f94d532302 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -297,9 +297,6 @@ extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; -extern unsigned long sysctl_hung_task_check_count; -extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_warnings; extern int softlockup_thresh; #else static inline void softlockup_tick(void) @@ -316,6 +313,15 @@ static inline void touch_all_softlockup_watchdogs(void) } #endif +#ifdef CONFIG_DETECT_HUNG_TASK +extern unsigned int sysctl_hung_task_panic; +extern unsigned long sysctl_hung_task_check_count; +extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_warnings; +extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos); +#endif /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) @@ -1236,7 +1242,7 @@ struct task_struct { /* ipc stuff */ struct sysv_sem sysvsem; #endif -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ unsigned long last_switch_timestamp; unsigned long last_switch_count; diff --git a/kernel/Makefile b/kernel/Makefile index 2aebc4cd7878..979745f1b4bc 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o +obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/hung_task.c b/kernel/hung_task.c new file mode 100644 index 000000000000..ba5a77cad3bb --- /dev/null +++ b/kernel/hung_task.c @@ -0,0 +1,198 @@ +/* + * Detect Hung Task + * + * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Have a reasonable limit on the number of tasks checked: + */ +unsigned long __read_mostly sysctl_hung_task_check_count = 1024; + +/* + * Zero means infinite timeout - no checking done: + */ +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +static unsigned long __read_mostly hung_task_poll_jiffies; + +unsigned long __read_mostly sysctl_hung_task_warnings = 10; + +static int __read_mostly did_panic; + +static struct task_struct *watchdog_task; + +/* + * Should we panic (and reboot, if panic_timeout= is set) when a + * hung task is detected: + */ +unsigned int __read_mostly sysctl_hung_task_panic = + CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + +static int __init hung_task_panic_setup(char *str) +{ + sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("hung_task_panic=", hung_task_panic_setup); + +static int +hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = hung_task_panic, +}; + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(void) +{ + int this_cpu = raw_smp_processor_id(); + + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static void check_hung_task(struct task_struct *t, unsigned long now) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + + if (t->flags & PF_FROZEN) + return; + + if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + t->last_switch_count = switch_count; + t->last_switch_timestamp = now; + return; + } + if ((long)(now - t->last_switch_timestamp) < + sysctl_hung_task_timeout_secs) + return; + if (!sysctl_hung_task_warnings) + return; + sysctl_hung_task_warnings--; + + /* + * Ok, the task did not get scheduled for more than 2 minutes, + * complain: + */ + printk(KERN_ERR "INFO: task %s:%d blocked for more than " + "%ld seconds.\n", t->comm, t->pid, + sysctl_hung_task_timeout_secs); + printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); + sched_show_task(t); + __debug_show_held_locks(t); + + t->last_switch_timestamp = now; + touch_nmi_watchdog(); + + if (sysctl_hung_task_panic) + panic("hung_task: blocked tasks"); +} + +/* + * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for + * a really long time (120 seconds). If that happens, print out + * a warning. + */ +static void check_hung_uninterruptible_tasks(void) +{ + int max_count = sysctl_hung_task_check_count; + unsigned long now = get_timestamp(); + struct task_struct *g, *t; + + /* + * If the system crashed already then all bets are off, + * do not report extra hung tasks: + */ + if (test_taint(TAINT_DIE) || did_panic) + return; + + read_lock(&tasklist_lock); + do_each_thread(g, t) { + if (!--max_count) + goto unlock; + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) + check_hung_task(t, now); + } while_each_thread(g, t); + unlock: + read_unlock(&tasklist_lock); +} + +static void update_poll_jiffies(void) +{ + /* timeout of 0 will disable the watchdog */ + if (sysctl_hung_task_timeout_secs == 0) + hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; + else + hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; +} + +/* + * Process updating of timeout sysctl + */ +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + + if (ret || !write) + goto out; + + update_poll_jiffies(); + + wake_up_process(watchdog_task); + + out: + return ret; +} + +/* + * kthread which checks for tasks stuck in D state + */ +static int watchdog(void *dummy) +{ + set_user_nice(current, 0); + update_poll_jiffies(); + + for ( ; ; ) { + while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + check_hung_uninterruptible_tasks(); + } + + return 0; +} + +static int __init hung_task_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + + return 0; +} + +module_init(hung_task_init); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 85d5a2455103..88796c330838 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -165,98 +165,12 @@ void softlockup_tick(void) panic("softlockup: hung tasks"); } -/* - * Have a reasonable limit on the number of tasks checked: - */ -unsigned long __read_mostly sysctl_hung_task_check_count = 1024; - -/* - * Zero means infinite timeout - no checking done: - */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; - -unsigned long __read_mostly sysctl_hung_task_warnings = 10; - -/* - * Only do the hung-tasks check on one CPU: - */ -static int check_cpu __read_mostly = -1; - -static void check_hung_task(struct task_struct *t, unsigned long now) -{ - unsigned long switch_count = t->nvcsw + t->nivcsw; - - if (t->flags & PF_FROZEN) - return; - - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { - t->last_switch_count = switch_count; - t->last_switch_timestamp = now; - return; - } - if ((long)(now - t->last_switch_timestamp) < - sysctl_hung_task_timeout_secs) - return; - if (!sysctl_hung_task_warnings) - return; - sysctl_hung_task_warnings--; - - /* - * Ok, the task did not get scheduled for more than 2 minutes, - * complain: - */ - printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, - sysctl_hung_task_timeout_secs); - printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); - sched_show_task(t); - __debug_show_held_locks(t); - - t->last_switch_timestamp = now; - touch_nmi_watchdog(); - - if (softlockup_panic) - panic("softlockup: blocked tasks"); -} - -/* - * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for - * a really long time (120 seconds). If that happens, print out - * a warning. - */ -static void check_hung_uninterruptible_tasks(int this_cpu) -{ - int max_count = sysctl_hung_task_check_count; - unsigned long now = get_timestamp(this_cpu); - struct task_struct *g, *t; - - /* - * If the system crashed already then all bets are off, - * do not report extra hung tasks: - */ - if (test_taint(TAINT_DIE) || did_panic) - return; - - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (!--max_count) - goto unlock; - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now); - } while_each_thread(g, t); - unlock: - read_unlock(&tasklist_lock); -} - /* * The watchdog thread - runs every second and touches the timestamp. */ static int watchdog(void *__bind_cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - int this_cpu = (long)__bind_cpu; sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu) if (kthread_should_stop()) break; - if (this_cpu == check_cpu) { - if (sysctl_hung_task_timeout_secs) - check_hung_uninterruptible_tasks(this_cpu); - } - set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - check_cpu = cpumask_any(cpu_online_mask); wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - if (hotcpu == check_cpu) { - /* Pick any other online cpu. */ - check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); - } - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: if (!per_cpu(watchdog_task, hotcpu)) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 596dc31a7116..2481ed30d2b5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -805,6 +805,19 @@ static struct ctl_table kern_table[] = { .extra1 = &neg_one, .extra2 = &sixty, }, +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_check_count", @@ -820,7 +833,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_hung_task_timeout_secs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_dohung_task_timeout_secs, .strategy = &sysctl_intvec, }, { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4c9ae6085c75..883ecea22f37 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -186,6 +186,44 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC default 1 if BOOTPARAM_SOFTLOCKUP_PANIC +config DETECT_HUNG_TASK + bool "Detect Hung Tasks" + depends on DEBUG_KERNEL + default y + help + Say Y here to enable the kernel to detect "hung tasks", + which are bugs that cause the task to be stuck in + uninterruptible "D" state indefinitiley. + + When a hung task is detected, the kernel will print the + current stack trace (which you should report), but the + task will stay in uninterruptible state. If lockdep is + enabled then all held locks will also be reported. This + feature has negligible overhead. + +config BOOTPARAM_HUNG_TASK_PANIC + bool "Panic (Reboot) On Hung Tasks" + depends on DETECT_HUNG_TASK + help + Say Y here to enable the kernel to panic on "hung tasks", + which are bugs that cause the kernel to leave a task stuck + in uninterruptible "D" state. + + The panic can be used in combination with panic_timeout, + to cause the system to reboot automatically after a + hung task has been detected. This feature is useful for + high-availability systems that have uptime guarantees and + where a hung tasks must be resolved ASAP. + + Say N if unsure. + +config BOOTPARAM_HUNG_TASK_PANIC_VALUE + int + depends on DETECT_HUNG_TASK + range 0 1 + default 0 if !BOOTPARAM_HUNG_TASK_PANIC + default 1 if BOOTPARAM_HUNG_TASK_PANIC + config SCHED_DEBUG bool "Collect scheduler debugging info" depends on DEBUG_KERNEL && PROC_FS -- cgit v1.2.3-59-g8ed1b From af432eb1cc3178ec7109aca2283aafb1c12ccac1 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 16 Jan 2009 09:09:38 -0800 Subject: softlockup: fix to allow compiling with !DETECT_HUNG_TASK Fixes the following compile error: kernel/fork.c:1049: error: 'struct task_struct' has no member named 'last_switch_count' kernel/fork.c:1050: error: 'struct task_struct' has no member named 'last_switch_timestamp' Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 1d68f1255dd8..fb9444282836 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1041,7 +1041,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_DETECT_HUNG_TASK p->last_switch_count = 0; p->last_switch_timestamp = 0; #endif -- cgit v1.2.3-59-g8ed1b From 603a148f434742fff08273207ffa44176cad13a1 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Sat, 17 Jan 2009 10:31:48 -0800 Subject: softlockup: fix potential race in hung_task when resetting timeout Impact: fix potential false panic A potential race exists if sysctl_hung_task_timeout_secs is reset to 0 while inside check_hung_uniterruptible_tasks(). If check_task() is entered, a comparison with 0 will result in a false hung_task being detected. If sysctl_hung_task_panic is set, the system will panic. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ba5a77cad3bb..ba8ccd432963 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -72,7 +72,8 @@ static unsigned long get_timestamp(void) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static void check_hung_task(struct task_struct *t, unsigned long now) +static void check_hung_task(struct task_struct *t, unsigned long now, + unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; @@ -84,8 +85,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) t->last_switch_timestamp = now; return; } - if ((long)(now - t->last_switch_timestamp) < - sysctl_hung_task_timeout_secs) + if ((long)(now - t->last_switch_timestamp) < timeout) return; if (!sysctl_hung_task_warnings) return; @@ -96,8 +96,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) * complain: */ printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, - sysctl_hung_task_timeout_secs); + "%ld seconds.\n", t->comm, t->pid, timeout); printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); @@ -115,7 +114,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now) * a really long time (120 seconds). If that happens, print out * a warning. */ -static void check_hung_uninterruptible_tasks(void) +static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; unsigned long now = get_timestamp(); @@ -134,7 +133,7 @@ static void check_hung_uninterruptible_tasks(void) goto unlock; /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now); + check_hung_task(t, now, timeout); } while_each_thread(g, t); unlock: read_unlock(&tasklist_lock); @@ -180,8 +179,17 @@ static int watchdog(void *dummy) update_poll_jiffies(); for ( ; ; ) { + unsigned long timeout; + while (schedule_timeout_interruptible(hung_task_poll_jiffies)); - check_hung_uninterruptible_tasks(); + + /* + * Need to cache timeout here to avoid timeout being set + * to 0 via sysctl while inside check_hung_*_tasks(). + */ + timeout = sysctl_hung_task_timeout_secs; + if (timeout) + check_hung_uninterruptible_tasks(timeout); } return 0; -- cgit v1.2.3-59-g8ed1b From b43f70933e7753a284733d5ae355f6778bd118ce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Jan 2009 23:18:31 -0500 Subject: ftrace: test for running of recordmcount.pl twice on an object Impact: fix failure of dynamic function tracer selftest In a course of development, a developer does several makes on their kernel. Sometimes, the make might do something abnormal. In the case of running the recordmcount.pl script on an object twice, the script will duplicate all the calls to mcount in the __mcount_loc section. On boot up, the dynamic function tracer is careful when it modifies code, and performs several consistency checks. One is to not modify the call site if it is not what it expects it to be. If a function call site is listed twice, the first entry will convert the site to a nop, and the second will fail because it expected to see a call to mcount, but instead it sees a nop. Thus, the function tracer is disabled. Eric Sesterhenn reported seeing: [ 1.055440] ftrace: converting mcount calls to 0f 1f 44 00 00 [ 1.055568] ftrace: allocating 29418 entries in 116 pages [ 1.061000] ------------[ cut here ]------------ [ 1.061000] WARNING: at kernel/trace/ftrace.c:441 [...] [ 1.060000] ---[ end trace 4eaa2a86a8e2da23 ]--- [ 1.060000] ftrace failed to modify [] check_corruption+0x3/0x2d [ 1.060000] actual: 0f:1f:44:00:00 This warning shows that check_corruption+0x3 already had a nop in its place (0x0f1f440000). After compiling another kernel the problem went away. Later Eric Paris notice the same type of issue. Luckily, he saved the vmlinux file that caused it. In the file we found a bunch of duplicate mcount call site records, which lead us to the script. Perhaps this problem only happens to people named Eric. This patch changes the script to test if the __mcount_loc already exists in the object file, and if it does, it will print out an error message and kill the compile. Reported-by: Eric Sesterhenn Reported-by: Eric Paris Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- scripts/recordmcount.pl | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 2ded5c8e6b85..409596eca124 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -101,7 +101,7 @@ $P =~ s@.*/@@g; my $V = '0.1'; if ($#ARGV < 7) { - print "usage: $P arch objdump objcopy cc ld nm rm mv is_module inputfile\n"; + print "usage: $P arch bits objdump objcopy cc ld nm rm mv is_module inputfile\n"; print "version: $V\n"; exit(1); } @@ -275,7 +275,6 @@ if (!$found_version) { "\tDisabling local function references.\n"; } - # # Step 1: find all the local (static functions) and weak symbols. # 't' is local, 'w/W' is weak (we never use a weak function) @@ -343,13 +342,16 @@ sub update_funcs # # Step 2: find the sections and mcount call sites # -open(IN, "$objdump -dr $inputfile|") || die "error running $objdump"; +open(IN, "$objdump -hdr $inputfile|") || die "error running $objdump"; my $text; +my $read_headers = 1; + while () { # is it a section? if (/$section_regex/) { + $read_headers = 0; # Only record text sections that we know are safe if (defined($text_sections{$1})) { @@ -383,6 +385,19 @@ while () { $ref_func = $text; } } + } elsif ($read_headers && /$mcount_section/) { + # + # Somehow the make process can execute this script on an + # object twice. If it does, we would duplicate the mcount + # section and it will cause the function tracer self test + # to fail. Check if the mcount section exists, and if it does, + # warn and exit. + # + print STDERR "ERROR: $mcount_section already in $inputfile\n" . + "\tThis may be an indication that your build is corrupted.\n" . + "\tDelete $inputfile and try again. If the same object file\n" . + "\tstill causes an issue, then disable CONFIG_DYNAMIC_FTRACE.\n"; + exit(-1); } # is this a call site to mcount? If so, record it to print later -- cgit v1.2.3-59-g8ed1b From d296d30a9948e895bff005d92c38309e8bd30975 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jan 2009 02:02:57 +0100 Subject: xfs: fix dentry aliasing issues in open_by_handle Open by handle just grabs an inode by handle and then creates itself a dentry for it. While this works for regular files it is horribly broken for directories, where the VFS locking relies on the fact that there is only just one single dentry for a given inode, and that these are always connected to the root of the filesystem so that it's locking algorithms work (see Documentations/filesystems/Locking) Remove all the existing open by handle code and replace it with a small wrapper around the exportfs code which deals with all these issues. At the same time we also make the checks for a valid handle strict enough to reject all not perfectly well formed handles - given that we never hand out others that's okay and simplifies the code. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/Kconfig | 1 + fs/xfs/linux-2.6/xfs_ioctl.c | 305 ++++++++++++++++++----------------------- fs/xfs/linux-2.6/xfs_ioctl.h | 15 +- fs/xfs/linux-2.6/xfs_ioctl32.c | 175 +++++++---------------- 4 files changed, 196 insertions(+), 300 deletions(-) diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 3f53dd101f99..29228f5899cd 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -1,6 +1,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK + select EXPORTFS help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index e5be1e0be802..4bd112313f33 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -50,12 +50,14 @@ #include "xfs_vnodeops.h" #include "xfs_quota.h" #include "xfs_inode_item.h" +#include "xfs_export.h" #include #include #include #include #include +#include /* * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to @@ -164,97 +166,69 @@ xfs_find_handle( return 0; } - /* - * Convert userspace handle data into inode. - * - * We use the fact that all the fsop_handlereq ioctl calls have a data - * structure argument whose first component is always a xfs_fsop_handlereq_t, - * so we can pass that sub structure into this handy, shared routine. - * - * If no error, caller must always iput the returned inode. + * No need to do permission checks on the various pathname components + * as the handle operations are privileged. */ STATIC int -xfs_vget_fsop_handlereq( - xfs_mount_t *mp, - struct inode *parinode, /* parent inode pointer */ - xfs_fsop_handlereq_t *hreq, - struct inode **inode) +xfs_handle_acceptable( + void *context, + struct dentry *dentry) +{ + return 1; +} + +/* + * Convert userspace handle data into a dentry. + */ +struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen) { - void __user *hanp; - size_t hlen; - xfs_fid_t *xfid; - xfs_handle_t *handlep; xfs_handle_t handle; - xfs_inode_t *ip; - xfs_ino_t ino; - __u32 igen; - int error; + struct xfs_fid64 fid; /* * Only allow handle opens under a directory. */ - if (!S_ISDIR(parinode->i_mode)) - return XFS_ERROR(ENOTDIR); - - hanp = hreq->ihandle; - hlen = hreq->ihandlen; - handlep = &handle; - - if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) - return XFS_ERROR(EINVAL); - if (copy_from_user(handlep, hanp, hlen)) - return XFS_ERROR(EFAULT); - if (hlen < sizeof(*handlep)) - memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); - if (hlen > sizeof(handlep->ha_fsid)) { - if (handlep->ha_fid.fid_len != - (hlen - sizeof(handlep->ha_fsid) - - sizeof(handlep->ha_fid.fid_len)) || - handlep->ha_fid.fid_pad) - return XFS_ERROR(EINVAL); - } - - /* - * Crack the handle, obtain the inode # & generation # - */ - xfid = (struct xfs_fid *)&handlep->ha_fid; - if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) { - ino = xfid->fid_ino; - igen = xfid->fid_gen; - } else { - return XFS_ERROR(EINVAL); - } - - /* - * Get the XFS inode, building a Linux inode to go with it. - */ - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); - if (error) - return error; - if (ip == NULL) - return XFS_ERROR(EIO); - if (ip->i_d.di_gen != igen) { - xfs_iput_new(ip, XFS_ILOCK_SHARED); - return XFS_ERROR(ENOENT); - } - - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode)) + return ERR_PTR(-ENOTDIR); + + if (hlen != sizeof(xfs_handle_t)) + return ERR_PTR(-EINVAL); + if (copy_from_user(&handle, uhandle, hlen)) + return ERR_PTR(-EFAULT); + if (handle.ha_fid.fid_len != + sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len)) + return ERR_PTR(-EINVAL); + + memset(&fid, 0, sizeof(struct fid)); + fid.ino = handle.ha_fid.fid_ino; + fid.gen = handle.ha_fid.fid_gen; + + return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3, + FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG, + xfs_handle_acceptable, NULL); +} - *inode = VFS_I(ip); - return 0; +STATIC struct dentry * +xfs_handlereq_to_dentry( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen); } int xfs_open_by_handle( - xfs_mount_t *mp, - xfs_fsop_handlereq_t *hreq, struct file *parfilp, - struct inode *parinode) + xfs_fsop_handlereq_t *hreq) { const struct cred *cred = current_cred(); int error; - int new_fd; + int fd; int permflag; struct file *filp; struct inode *inode; @@ -263,19 +237,21 @@ xfs_open_by_handle( if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); - error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode); - if (error) - return -error; + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + inode = dentry->d_inode; /* Restrict xfs_open_by_handle to directories & regular files. */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - iput(inode); - return -XFS_ERROR(EINVAL); + error = -XFS_ERROR(EPERM); + goto out_dput; } #if BITS_PER_LONG != 32 hreq->oflags |= O_LARGEFILE; #endif + /* Put open permission in namei format. */ permflag = hreq->oflags; if ((permflag+1) & O_ACCMODE) @@ -285,50 +261,45 @@ xfs_open_by_handle( if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && (permflag & FMODE_WRITE) && IS_APPEND(inode)) { - iput(inode); - return -XFS_ERROR(EPERM); + error = -XFS_ERROR(EPERM); + goto out_dput; } if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - iput(inode); - return -XFS_ERROR(EACCES); + error = -XFS_ERROR(EACCES); + goto out_dput; } /* Can't write directories. */ - if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { - iput(inode); - return -XFS_ERROR(EISDIR); + if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { + error = -XFS_ERROR(EISDIR); + goto out_dput; } - if ((new_fd = get_unused_fd()) < 0) { - iput(inode); - return new_fd; + fd = get_unused_fd(); + if (fd < 0) { + error = fd; + goto out_dput; } - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - put_unused_fd(new_fd); - return PTR_ERR(dentry); - } - - /* Ensure umount returns EBUSY on umounts while this file is open. */ - mntget(parfilp->f_path.mnt); - - /* Create file pointer. */ - filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred); + filp = dentry_open(dentry, mntget(parfilp->f_path.mnt), + hreq->oflags, cred); if (IS_ERR(filp)) { - put_unused_fd(new_fd); - return -XFS_ERROR(-PTR_ERR(filp)); + put_unused_fd(fd); + return PTR_ERR(filp); } if (inode->i_mode & S_IFREG) { - /* invisible operation should not change atime */ filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NOCMTIME; } - fd_install(new_fd, filp); - return new_fd; + fd_install(fd, filp); + return fd; + + out_dput: + dput(dentry); + return error; } /* @@ -359,11 +330,10 @@ do_readlink( int xfs_readlink_by_handle( - xfs_mount_t *mp, - xfs_fsop_handlereq_t *hreq, - struct inode *parinode) + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) { - struct inode *inode; + struct dentry *dentry; __u32 olen; void *link; int error; @@ -371,26 +341,28 @@ xfs_readlink_by_handle( if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); - error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode); - if (error) - return -error; + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); /* Restrict this handle operation to symlinks only. */ - if (!S_ISLNK(inode->i_mode)) { + if (!S_ISLNK(dentry->d_inode->i_mode)) { error = -XFS_ERROR(EINVAL); - goto out_iput; + goto out_dput; } if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { error = -XFS_ERROR(EFAULT); - goto out_iput; + goto out_dput; } link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); - if (!link) - goto out_iput; + if (!link) { + error = -XFS_ERROR(ENOMEM); + goto out_dput; + } - error = -xfs_readlink(XFS_I(inode), link); + error = -xfs_readlink(XFS_I(dentry->d_inode), link); if (error) goto out_kfree; error = do_readlink(hreq->ohandle, olen, link); @@ -399,32 +371,31 @@ xfs_readlink_by_handle( out_kfree: kfree(link); - out_iput: - iput(inode); + out_dput: + dput(dentry); return error; } STATIC int xfs_fssetdm_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct inode *parinode) + struct file *parfilp, + void __user *arg) { int error; struct fsdmidata fsd; xfs_fsop_setdm_handlereq_t dmhreq; - struct inode *inode; + struct dentry *dentry; if (!capable(CAP_MKNOD)) return -XFS_ERROR(EPERM); if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &inode); - if (error) - return -error; + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { error = -XFS_ERROR(EPERM); goto out; } @@ -434,24 +405,23 @@ xfs_fssetdm_by_handle( goto out; } - error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask, + error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: - iput(inode); + dput(dentry); return error; } STATIC int xfs_attrlist_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct inode *parinode) + struct file *parfilp, + void __user *arg) { - int error; + int error = -ENOMEM; attrlist_cursor_kern_t *cursor; xfs_fsop_attrlist_handlereq_t al_hreq; - struct inode *inode; + struct dentry *dentry; char *kbuf; if (!capable(CAP_SYS_ADMIN)) @@ -467,16 +437,16 @@ xfs_attrlist_by_handle( if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) return -XFS_ERROR(EINVAL); - error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode); - if (error) - goto out; + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); if (!kbuf) - goto out_vn_rele; + goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen, + error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -486,10 +456,9 @@ xfs_attrlist_by_handle( out_kfree: kfree(kbuf); - out_vn_rele: - iput(inode); - out: - return -error; + out_dput: + dput(dentry); + return error; } int @@ -564,15 +533,13 @@ xfs_attrmulti_attr_remove( STATIC int xfs_attrmulti_by_handle( - xfs_mount_t *mp, - void __user *arg, struct file *parfilp, - struct inode *parinode) + void __user *arg) { int error; xfs_attr_multiop_t *ops; xfs_fsop_attrmulti_handlereq_t am_hreq; - struct inode *inode; + struct dentry *dentry; unsigned int i, size; char *attr_name; @@ -581,19 +548,19 @@ xfs_attrmulti_by_handle( if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &inode); - if (error) - goto out; + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); error = E2BIG; size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); if (!size || size > 16 * PAGE_SIZE) - goto out_vn_rele; + goto out_dput; error = ENOMEM; ops = kmalloc(size, GFP_KERNEL); if (!ops) - goto out_vn_rele; + goto out_dput; error = EFAULT; if (copy_from_user(ops, am_hreq.ops, size)) @@ -615,25 +582,28 @@ xfs_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get(inode, - attr_name, ops[i].am_attrvalue, - &ops[i].am_length, ops[i].am_flags); + ops[i].am_error = xfs_attrmulti_attr_get( + dentry->d_inode, attr_name, + ops[i].am_attrvalue, &ops[i].am_length, + ops[i].am_flags); break; case ATTR_OP_SET: ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); if (ops[i].am_error) break; - ops[i].am_error = xfs_attrmulti_attr_set(inode, - attr_name, ops[i].am_attrvalue, - ops[i].am_length, ops[i].am_flags); + ops[i].am_error = xfs_attrmulti_attr_set( + dentry->d_inode, attr_name, + ops[i].am_attrvalue, ops[i].am_length, + ops[i].am_flags); mnt_drop_write(parfilp->f_path.mnt); break; case ATTR_OP_REMOVE: ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); if (ops[i].am_error) break; - ops[i].am_error = xfs_attrmulti_attr_remove(inode, - attr_name, ops[i].am_flags); + ops[i].am_error = xfs_attrmulti_attr_remove( + dentry->d_inode, attr_name, + ops[i].am_flags); mnt_drop_write(parfilp->f_path.mnt); break; default: @@ -647,9 +617,8 @@ xfs_attrmulti_by_handle( kfree(attr_name); out_kfree_ops: kfree(ops); - out_vn_rele: - iput(inode); - out: + out_dput: + dput(dentry); return -error; } @@ -1440,23 +1409,23 @@ xfs_file_ioctl( if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) return -XFS_ERROR(EFAULT); - return xfs_open_by_handle(mp, &hreq, filp, inode); + return xfs_open_by_handle(filp, &hreq); } case XFS_IOC_FSSETDM_BY_HANDLE: - return xfs_fssetdm_by_handle(mp, arg, inode); + return xfs_fssetdm_by_handle(filp, arg); case XFS_IOC_READLINK_BY_HANDLE: { xfs_fsop_handlereq_t hreq; if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) return -XFS_ERROR(EFAULT); - return xfs_readlink_by_handle(mp, &hreq, inode); + return xfs_readlink_by_handle(filp, &hreq); } case XFS_IOC_ATTRLIST_BY_HANDLE: - return xfs_attrlist_by_handle(mp, arg, inode); + return xfs_attrlist_by_handle(filp, arg); case XFS_IOC_ATTRMULTI_BY_HANDLE: - return xfs_attrmulti_by_handle(mp, arg, filp, inode); + return xfs_attrmulti_by_handle(filp, arg); case XFS_IOC_SWAPEXT: { struct xfs_swapext sxp; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h index 8c16bf2d7e03..7bd7c6afc1eb 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.h +++ b/fs/xfs/linux-2.6/xfs_ioctl.h @@ -34,16 +34,13 @@ xfs_find_handle( extern int xfs_open_by_handle( - xfs_mount_t *mp, - xfs_fsop_handlereq_t *hreq, struct file *parfilp, - struct inode *parinode); + xfs_fsop_handlereq_t *hreq); extern int xfs_readlink_by_handle( - xfs_mount_t *mp, - xfs_fsop_handlereq_t *hreq, - struct inode *parinode); + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); extern int xfs_attrmulti_attr_get( @@ -67,6 +64,12 @@ xfs_attrmulti_attr_remove( char *name, __uint32_t flags); +extern struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen); + extern long xfs_file_ioctl( struct file *filp, diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 50903ad3182e..fd4362063f25 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -340,96 +340,24 @@ xfs_compat_handlereq_copyin( return 0; } -/* - * Convert userspace handle data into inode. - * - * We use the fact that all the fsop_handlereq ioctl calls have a data - * structure argument whose first component is always a xfs_fsop_handlereq_t, - * so we can pass that sub structure into this handy, shared routine. - * - * If no error, caller must always iput the returned inode. - */ -STATIC int -xfs_vget_fsop_handlereq_compat( - xfs_mount_t *mp, - struct inode *parinode, /* parent inode pointer */ - compat_xfs_fsop_handlereq_t *hreq, - struct inode **inode) +STATIC struct dentry * +xfs_compat_handlereq_to_dentry( + struct file *parfilp, + compat_xfs_fsop_handlereq_t *hreq) { - void __user *hanp; - size_t hlen; - xfs_fid_t *xfid; - xfs_handle_t *handlep; - xfs_handle_t handle; - xfs_inode_t *ip; - xfs_ino_t ino; - __u32 igen; - int error; - - /* - * Only allow handle opens under a directory. - */ - if (!S_ISDIR(parinode->i_mode)) - return XFS_ERROR(ENOTDIR); - - hanp = compat_ptr(hreq->ihandle); - hlen = hreq->ihandlen; - handlep = &handle; - - if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) - return XFS_ERROR(EINVAL); - if (copy_from_user(handlep, hanp, hlen)) - return XFS_ERROR(EFAULT); - if (hlen < sizeof(*handlep)) - memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); - if (hlen > sizeof(handlep->ha_fsid)) { - if (handlep->ha_fid.fid_len != - (hlen - sizeof(handlep->ha_fsid) - - sizeof(handlep->ha_fid.fid_len)) || - handlep->ha_fid.fid_pad) - return XFS_ERROR(EINVAL); - } - - /* - * Crack the handle, obtain the inode # & generation # - */ - xfid = (struct xfs_fid *)&handlep->ha_fid; - if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) { - ino = xfid->fid_ino; - igen = xfid->fid_gen; - } else { - return XFS_ERROR(EINVAL); - } - - /* - * Get the XFS inode, building a Linux inode to go with it. - */ - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); - if (error) - return error; - if (ip == NULL) - return XFS_ERROR(EIO); - if (ip->i_d.di_gen != igen) { - xfs_iput_new(ip, XFS_ILOCK_SHARED); - return XFS_ERROR(ENOENT); - } - - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - *inode = VFS_I(ip); - return 0; + return xfs_handle_to_dentry(parfilp, + compat_ptr(hreq->ihandle), hreq->ihandlen); } STATIC int xfs_compat_attrlist_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct inode *parinode) + struct file *parfilp, + void __user *arg) { int error; attrlist_cursor_kern_t *cursor; compat_xfs_fsop_attrlist_handlereq_t al_hreq; - struct inode *inode; + struct dentry *dentry; char *kbuf; if (!capable(CAP_SYS_ADMIN)) @@ -446,17 +374,17 @@ xfs_compat_attrlist_by_handle( if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) return -XFS_ERROR(EINVAL); - error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq, - &inode); - if (error) - goto out; + dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + error = -ENOMEM; kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); if (!kbuf) - goto out_vn_rele; + goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen, + error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -466,22 +394,20 @@ xfs_compat_attrlist_by_handle( out_kfree: kfree(kbuf); - out_vn_rele: - iput(inode); - out: - return -error; + out_dput: + dput(dentry); + return error; } STATIC int xfs_compat_attrmulti_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct inode *parinode) + struct file *parfilp, + void __user *arg) { int error; compat_xfs_attr_multiop_t *ops; compat_xfs_fsop_attrmulti_handlereq_t am_hreq; - struct inode *inode; + struct dentry *dentry; unsigned int i, size; char *attr_name; @@ -491,20 +417,19 @@ xfs_compat_attrmulti_by_handle( sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq, - &inode); - if (error) - goto out; + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); error = E2BIG; size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); if (!size || size > 16 * PAGE_SIZE) - goto out_vn_rele; + goto out_dput; error = ENOMEM; ops = kmalloc(size, GFP_KERNEL); if (!ops) - goto out_vn_rele; + goto out_dput; error = EFAULT; if (copy_from_user(ops, compat_ptr(am_hreq.ops), size)) @@ -527,20 +452,21 @@ xfs_compat_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get(inode, - attr_name, + ops[i].am_error = xfs_attrmulti_attr_get( + dentry->d_inode, attr_name, compat_ptr(ops[i].am_attrvalue), &ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_SET: - ops[i].am_error = xfs_attrmulti_attr_set(inode, - attr_name, + ops[i].am_error = xfs_attrmulti_attr_set( + dentry->d_inode, attr_name, compat_ptr(ops[i].am_attrvalue), ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_REMOVE: - ops[i].am_error = xfs_attrmulti_attr_remove(inode, - attr_name, ops[i].am_flags); + ops[i].am_error = xfs_attrmulti_attr_remove( + dentry->d_inode, attr_name, + ops[i].am_flags); break; default: ops[i].am_error = EINVAL; @@ -553,22 +479,20 @@ xfs_compat_attrmulti_by_handle( kfree(attr_name); out_kfree_ops: kfree(ops); - out_vn_rele: - iput(inode); - out: + out_dput: + dput(dentry); return -error; } STATIC int xfs_compat_fssetdm_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct inode *parinode) + struct file *parfilp, + void __user *arg) { int error; struct fsdmidata fsd; compat_xfs_fsop_setdm_handlereq_t dmhreq; - struct inode *inode; + struct dentry *dentry; if (!capable(CAP_MKNOD)) return -XFS_ERROR(EPERM); @@ -576,12 +500,11 @@ xfs_compat_fssetdm_by_handle( sizeof(compat_xfs_fsop_setdm_handlereq_t))) return -XFS_ERROR(EFAULT); - error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq, - &inode); - if (error) - return -error; + dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { error = -XFS_ERROR(EPERM); goto out; } @@ -591,11 +514,11 @@ xfs_compat_fssetdm_by_handle( goto out; } - error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask, + error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: - iput(inode); + dput(dentry); return error; } @@ -722,21 +645,21 @@ xfs_file_compat_ioctl( if (xfs_compat_handlereq_copyin(&hreq, arg)) return -XFS_ERROR(EFAULT); - return xfs_open_by_handle(mp, &hreq, filp, inode); + return xfs_open_by_handle(filp, &hreq); } case XFS_IOC_READLINK_BY_HANDLE_32: { struct xfs_fsop_handlereq hreq; if (xfs_compat_handlereq_copyin(&hreq, arg)) return -XFS_ERROR(EFAULT); - return xfs_readlink_by_handle(mp, &hreq, inode); + return xfs_readlink_by_handle(filp, &hreq); } case XFS_IOC_ATTRLIST_BY_HANDLE_32: - return xfs_compat_attrlist_by_handle(mp, arg, inode); + return xfs_compat_attrlist_by_handle(filp, arg); case XFS_IOC_ATTRMULTI_BY_HANDLE_32: - return xfs_compat_attrmulti_by_handle(mp, arg, inode); + return xfs_compat_attrmulti_by_handle(filp, arg); case XFS_IOC_FSSETDM_BY_HANDLE_32: - return xfs_compat_fssetdm_by_handle(mp, arg, inode); + return xfs_compat_fssetdm_by_handle(filp, arg); default: return -XFS_ERROR(ENOIOCTLCMD); } -- cgit v1.2.3-59-g8ed1b From 178eae342b34185ef7b11e62df2f74ba45daa56e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jan 2009 02:03:03 +0100 Subject: xfs: use mnt_want_write in compat_attrmulti ioctl The compat version of the attrmulti ioctl needs to ask for and then later release write access to the mount just like the native version, otherwise we could potentially write to read-only mounts. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/linux-2.6/xfs_ioctl32.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index fd4362063f25..c70c4e3db790 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -17,6 +17,7 @@ */ #include #include +#include #include #include "xfs.h" #include "xfs_fs.h" @@ -458,15 +459,23 @@ xfs_compat_attrmulti_by_handle( &ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_SET: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; ops[i].am_error = xfs_attrmulti_attr_set( dentry->d_inode, attr_name, compat_ptr(ops[i].am_attrvalue), ops[i].am_length, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); break; case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; ops[i].am_error = xfs_attrmulti_attr_remove( dentry->d_inode, attr_name, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); break; default: ops[i].am_error = EINVAL; -- cgit v1.2.3-59-g8ed1b From a4edd1da20af79b2e92efeee3ca94831c8024d61 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jan 2009 02:03:11 +0100 Subject: xfs: add a separate lock class for the per-mount list of dquots We can have both a a quota hash chain and the per-mount list locked at the same time. But given that both use the same struct dqhash as list head we have to tell lockdep that they are different lock classes. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/quota/xfs_qm.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 6b13960cf318..7a2beb64314f 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -1070,6 +1070,13 @@ xfs_qm_sync( return 0; } +/* + * The hash chains and the mplist use the same xfs_dqhash structure as + * their list head, but we can take the mplist qh_lock and one of the + * hash qh_locks at the same time without any problem as they aren't + * related. + */ +static struct lock_class_key xfs_quota_mplist_class; /* * This initializes all the quota information that's kept in the @@ -1105,6 +1112,8 @@ xfs_qm_init_quotainfo( } xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); + lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class); + qinf->qi_dqreclaims = 0; /* mutex used to serialize quotaoffs */ -- cgit v1.2.3-59-g8ed1b From 5bb87a33b2cfb8e7ef3383718274094bdff266a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jan 2009 02:03:19 +0100 Subject: xfs: lockdep annotations for xfs_dqlock2 xfs_dqlock2 locks two xfs_dquots, which is fine as it always locks the dquot with the lower id first. Use mutex_lock_nested to tell lockdep about this fact. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/quota/xfs_dquot.c | 24 ++++++++++++++---------- fs/xfs/quota/xfs_dquot.h | 10 ++++++++++ 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index d68b4e1cf1d1..ebdf3c842dce 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -1383,6 +1383,12 @@ xfs_dqunlock_nonotify( mutex_unlock(&(dqp->q_qlock)); } +/* + * Lock two xfs_dquot structures. + * + * To avoid deadlocks we always lock the quota structure with + * the lowerd id first. + */ void xfs_dqlock2( xfs_dquot_t *d1, @@ -1392,18 +1398,16 @@ xfs_dqlock2( ASSERT(d1 != d2); if (be32_to_cpu(d1->q_core.d_id) > be32_to_cpu(d2->q_core.d_id)) { - xfs_dqlock(d2); - xfs_dqlock(d1); + mutex_lock(&d2->q_qlock); + mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); } else { - xfs_dqlock(d1); - xfs_dqlock(d2); - } - } else { - if (d1) { - xfs_dqlock(d1); - } else if (d2) { - xfs_dqlock(d2); + mutex_lock(&d1->q_qlock); + mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED); } + } else if (d1) { + mutex_lock(&d1->q_qlock); + } else if (d2) { + mutex_lock(&d2->q_qlock); } } diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index 7e455337e2ba..d443e93b4331 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -97,6 +97,16 @@ typedef struct xfs_dquot { #define dq_hashlist q_lists.dqm_hashlist #define dq_flags q_lists.dqm_flags +/* + * Lock hierachy for q_qlock: + * XFS_QLOCK_NORMAL is the implicit default, + * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 + */ +enum { + XFS_QLOCK_NORMAL = 0, + XFS_QLOCK_NESTED, +}; + #define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) #ifdef DEBUG -- cgit v1.2.3-59-g8ed1b From 98b8c7a0c42acf0d6963dbb9aabe4a2e312aae12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jan 2009 02:03:25 +0100 Subject: xfs: add a lock class for group/project dquots We can have both a user and a group/project dquot locked at the same time, as long as the user dquot is locked first. Tell lockdep about that fact by making the group/project dquots a different lock class. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/quota/xfs_dquot.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index ebdf3c842dce..6543c0b29753 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -73,6 +73,8 @@ int xfs_dqreq_num; int xfs_dqerror_mod = 33; #endif +static struct lock_class_key xfs_dquot_other_class; + /* * Allocate and initialize a dquot. We don't always allocate fresh memory; * we try to reclaim a free dquot if the number of incore dquots are above @@ -139,7 +141,15 @@ xfs_qm_dqinit( ASSERT(dqp->q_trace); xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT"); #endif - } + } + + /* + * In either case we need to make sure group quotas have a different + * lock class than user quotas, to make sure lockdep knows we can + * locks of one of each at the same time. + */ + if (!(type & XFS_DQ_USER)) + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); /* * log item gets initialized later -- cgit v1.2.3-59-g8ed1b From 7884bc8617e6b8afda8cb8853cf14abfd3148d5c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jan 2009 02:04:07 +0100 Subject: xfs: fix bad_features2 fixups for the root filesystem Currently the bad_features2 fixup and the alignment updates in the superblock are skipped if we mount a filesystem read-only. But for the root filesystem the typical case is to mount read-only first and only later remount writeable so we'll never perform this update at all. It's not a big problem but means the logs of people needing the fixup get spammed at every boot because they never happen on disk. Reported-by: Arkadiusz Miskiewicz Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/linux-2.6/xfs_super.c | 17 ++++++++++++++++- fs/xfs/xfs_mount.c | 26 +++++++++++++------------- fs/xfs/xfs_mount.h | 3 +++ 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 95a971080368..c71e226da7f5 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1197,6 +1197,7 @@ xfs_fs_remount( struct xfs_mount *mp = XFS_M(sb); substring_t args[MAX_OPT_ARGS]; char *p; + int error; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1247,11 +1248,25 @@ xfs_fs_remount( } } - /* rw/ro -> rw */ + /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { mp->m_flags &= ~XFS_MOUNT_RDONLY; if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_mountfs_check_barriers(mp); + + /* + * If this is the first remount to writeable state we + * might have some superblock changes to update. + */ + if (mp->m_update_flags) { + error = xfs_mount_log_sb(mp, mp->m_update_flags); + if (error) { + cmn_err(CE_WARN, + "XFS: failed to write sb changes"); + return error; + } + mp->m_update_flags = 0; + } } /* rw -> ro */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 3c97c6463a4e..35300250e86d 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -45,7 +45,6 @@ #include "xfs_fsops.h" #include "xfs_utils.h" -STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t); STATIC int xfs_uuid_mount(xfs_mount_t *); STATIC void xfs_unmountfs_wait(xfs_mount_t *); @@ -682,7 +681,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) * Update alignment values based on mount options and sb values */ STATIC int -xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags) +xfs_update_alignment(xfs_mount_t *mp) { xfs_sb_t *sbp = &(mp->m_sb); @@ -736,11 +735,11 @@ xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags) if (xfs_sb_version_hasdalign(sbp)) { if (sbp->sb_unit != mp->m_dalign) { sbp->sb_unit = mp->m_dalign; - *update_flags |= XFS_SB_UNIT; + mp->m_update_flags |= XFS_SB_UNIT; } if (sbp->sb_width != mp->m_swidth) { sbp->sb_width = mp->m_swidth; - *update_flags |= XFS_SB_WIDTH; + mp->m_update_flags |= XFS_SB_WIDTH; } } } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && @@ -905,7 +904,6 @@ xfs_mountfs( xfs_sb_t *sbp = &(mp->m_sb); xfs_inode_t *rip; __uint64_t resblks; - __int64_t update_flags = 0LL; uint quotamount, quotaflags; int uuid_mounted = 0; int error = 0; @@ -933,7 +931,7 @@ xfs_mountfs( "XFS: correcting sb_features alignment problem"); sbp->sb_features2 |= sbp->sb_bad_features2; sbp->sb_bad_features2 = sbp->sb_features2; - update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; + mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; /* * Re-check for ATTR2 in case it was found in bad_features2 @@ -947,11 +945,11 @@ xfs_mountfs( if (xfs_sb_version_hasattr2(&mp->m_sb) && (mp->m_flags & XFS_MOUNT_NOATTR2)) { xfs_sb_version_removeattr2(&mp->m_sb); - update_flags |= XFS_SB_FEATURES2; + mp->m_update_flags |= XFS_SB_FEATURES2; /* update sb_versionnum for the clearing of the morebits */ if (!sbp->sb_features2) - update_flags |= XFS_SB_VERSIONNUM; + mp->m_update_flags |= XFS_SB_VERSIONNUM; } /* @@ -960,7 +958,7 @@ xfs_mountfs( * allocator alignment is within an ag, therefore ag has * to be aligned at stripe boundary. */ - error = xfs_update_alignment(mp, &update_flags); + error = xfs_update_alignment(mp); if (error) goto error1; @@ -1137,10 +1135,12 @@ xfs_mountfs( } /* - * If fs is not mounted readonly, then update the superblock changes. + * If this is a read-only mount defer the superblock updates until + * the next remount into writeable mode. Otherwise we would never + * perform the update e.g. for the root filesystem. */ - if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { - error = xfs_mount_log_sb(mp, update_flags); + if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + error = xfs_mount_log_sb(mp, mp->m_update_flags); if (error) { cmn_err(CE_WARN, "XFS: failed to write sb changes"); goto error4; @@ -1820,7 +1820,7 @@ xfs_uuid_mount( * be altered by the mount options, as well as any potential sb_features2 * fixup. Only the first superblock is updated. */ -STATIC int +int xfs_mount_log_sb( xfs_mount_t *mp, __int64_t fields) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9bb41a9f765d..f5e9937f9bdb 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -327,6 +327,8 @@ typedef struct xfs_mount { spinlock_t m_sync_lock; /* work item list lock */ int m_sync_seq; /* sync thread generation no. */ wait_queue_head_t m_wait_single_sync_task; + __int64_t m_update_flags; /* sb flags we need to update + on the next remount,rw */ } xfs_mount_t; /* @@ -512,6 +514,7 @@ extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); +extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); -- cgit v1.2.3-59-g8ed1b From 2809f76afce05a73e08120f455107912aa519647 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Jan 2009 02:04:16 +0100 Subject: xfs: sanity check attr fork size Recently we have quite a few kerneloops reports about dereferencing a NULL if_data in the attribute fork. From looking over the code this can only happen if we pass a 0 size argument to xfs_iformat_local. This implies some sort of corruption and in fact the only mailinglist report about this from earlier this year was after a powerfail presumably on a system with write cache and without barriers. Add a quick sanity check for the attr fork size in xfs_iformat to catch these early and without an oops. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/xfs_inode.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index be2ca4d67b53..e7ae08d1df48 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -424,6 +424,19 @@ xfs_iformat( case XFS_DINODE_FMT_LOCAL: atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); size = be16_to_cpu(atp->hdr.totsize); + + if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu " + "(bad attr fork size %Ld).", + (unsigned long long) ip->i_ino, + (long long) size); + XFS_CORRUPTION_ERROR("xfs_iformat(8)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); break; case XFS_DINODE_FMT_EXTENTS: -- cgit v1.2.3-59-g8ed1b From 5c5317de147e9b38ea9c4cbdc2d15bed7648d036 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:26:53 +0100 Subject: x86, ftrace, hw-branch-tracer: support hotplug cpus Support hotplug cpus. Reported-by: Andi Kleen Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 123 ++++++++++++++++++++++++++++++++++----- 1 file changed, 107 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index df21c1e72b95..398195397c75 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -1,7 +1,8 @@ /* * h/w branch tracer for x86 based on bts * - * Copyright (C) 2008 Markus Metzger + * Copyright (C) 2008-2009 Intel Corporation. + * Markus Metzger , 2008-2009 * */ @@ -10,6 +11,9 @@ #include #include #include +#include +#include +#include #include @@ -19,13 +23,31 @@ #define SIZEOF_BTS (1 << 13) +/* The tracer mutex protects the below per-cpu tracer array. + It needs to be held to: + - start tracing on all cpus + - stop tracing on all cpus + - start tracing on a single hotplug cpu + - stop tracing on a single hotplug cpu + - read the trace from all cpus + - read the trace from a single cpu +*/ +static DEFINE_MUTEX(bts_tracer_mutex); static DEFINE_PER_CPU(struct bts_tracer *, tracer); static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) #define this_buffer per_cpu(buffer, smp_processor_id()) +static int __read_mostly trace_hw_branches_enabled; + +/* + * Start tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_mutex must be locked. + */ static void bts_trace_start_cpu(void *arg) { if (this_tracer) @@ -43,14 +65,20 @@ static void bts_trace_start_cpu(void *arg) static void bts_trace_start(struct trace_array *tr) { - int cpu; + mutex_lock(&bts_tracer_mutex); - tracing_reset_online_cpus(tr); + on_each_cpu(bts_trace_start_cpu, NULL, 1); + trace_hw_branches_enabled = 1; - for_each_cpu(cpu, cpu_possible_mask) - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + mutex_unlock(&bts_tracer_mutex); } +/* + * Start tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_mutex must be locked. + */ static void bts_trace_stop_cpu(void *arg) { if (this_tracer) { @@ -61,20 +89,58 @@ static void bts_trace_stop_cpu(void *arg) static void bts_trace_stop(struct trace_array *tr) { - int cpu; + mutex_lock(&bts_tracer_mutex); + + trace_hw_branches_enabled = 0; + on_each_cpu(bts_trace_stop_cpu, NULL, 1); - for_each_cpu(cpu, cpu_possible_mask) + mutex_unlock(&bts_tracer_mutex); +} + +static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + mutex_lock(&bts_tracer_mutex); + + if (!trace_hw_branches_enabled) + goto out; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + break; + case CPU_DOWN_PREPARE: smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); + break; + } + + out: + mutex_unlock(&bts_tracer_mutex); + return NOTIFY_DONE; } +static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { + .notifier_call = bts_hotcpu_handler +}; + static int bts_trace_init(struct trace_array *tr) { + register_hotcpu_notifier(&bts_hotcpu_notifier); tracing_reset_online_cpus(tr); bts_trace_start(tr); return 0; } +static void bts_trace_reset(struct trace_array *tr) +{ + bts_trace_stop(tr); + unregister_hotcpu_notifier(&bts_hotcpu_notifier); +} + static void bts_trace_print_header(struct seq_file *m) { seq_puts(m, @@ -108,18 +174,34 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) { struct ring_buffer_event *event; struct hw_branch_entry *entry; - unsigned long irq; + unsigned long irq1, irq2; + int cpu; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); - if (!event) + if (unlikely(!tr)) + return; + + if (unlikely(!trace_hw_branches_enabled)) return; + + local_irq_save(irq1); + cpu = raw_smp_processor_id(); + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2); + if (!event) + goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, from); entry->ent.type = TRACE_HW_BRANCHES; - entry->ent.cpu = smp_processor_id(); + entry->ent.cpu = cpu; entry->from = from; entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event, irq); + ring_buffer_unlock_commit(tr->buffer, event, irq2); + + out: + atomic_dec(&tr->data[cpu]->disabled); + local_irq_restore(irq1); } static void trace_bts_at(struct trace_array *tr, @@ -143,6 +225,11 @@ static void trace_bts_at(struct trace_array *tr, } } +/* + * Collect the trace on the current cpu and write it into the ftrace buffer. + * + * pre: bts_tracer_mutex must be locked + */ static void trace_bts_cpu(void *arg) { struct trace_array *tr = (struct trace_array *) arg; @@ -152,6 +239,9 @@ static void trace_bts_cpu(void *arg) if (!this_tracer) return; + if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) + return; + ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) @@ -171,17 +261,18 @@ out: static void trace_bts_prepare(struct trace_iterator *iter) { - int cpu; + mutex_lock(&bts_tracer_mutex); + + on_each_cpu(trace_bts_cpu, iter->tr, 1); - for_each_cpu(cpu, cpu_possible_mask) - smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); + mutex_unlock(&bts_tracer_mutex); } struct tracer bts_tracer __read_mostly = { .name = "hw-branch-tracer", .init = bts_trace_init, - .reset = bts_trace_stop, + .reset = bts_trace_reset, .print_header = bts_trace_print_header, .print_line = bts_trace_print_line, .start = bts_trace_start, -- cgit v1.2.3-59-g8ed1b From b1818748b0cf9427e48acf9713295e829a0d715f Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:31:01 +0100 Subject: x86, ftrace, hw-branch-tracer: dump trace on oops Dump the branch trace on an oops (based on ftrace_dump_on_oops). Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 6 ++++++ include/linux/ftrace.h | 13 +++++++++++++ kernel/trace/trace.h | 1 - kernel/trace/trace_hw_branches.c | 29 ++++++++++++++++++++++------- 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6b1f6f6f8661..077c9ea655fc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -195,6 +196,11 @@ unsigned __kprobes long oops_begin(void) int cpu; unsigned long flags; + /* notify the hw-branch tracer so it may disable tracing and + add the last trace to the trace buffer - + the earlier this happens, the more useful the trace. */ + trace_hw_branch_oops(); + oops_enter(); /* racy, but better than risking deadlock. */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 054721487574..9f7880d87c39 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -496,4 +496,17 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) #endif /* CONFIG_TRACING */ + +#ifdef CONFIG_HW_BRANCH_TRACER + +void trace_hw_branch(u64 from, u64 to); +void trace_hw_branch_oops(void); + +#else /* CONFIG_HW_BRANCH_TRACER */ + +static inline void trace_hw_branch(u64 from, u64 to) {} +static inline void trace_hw_branch_oops(void) {} + +#endif /* CONFIG_HW_BRANCH_TRACER */ + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54b72781e920..b96037d970df 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,7 +438,6 @@ void trace_function(struct trace_array *tr, void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 398195397c75..e56df2c7d679 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -40,6 +40,7 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); #define this_buffer per_cpu(buffer, smp_processor_id()) static int __read_mostly trace_hw_branches_enabled; +static struct trace_array *hw_branch_trace __read_mostly; /* @@ -128,6 +129,8 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { static int bts_trace_init(struct trace_array *tr) { + hw_branch_trace = tr; + register_hotcpu_notifier(&bts_hotcpu_notifier); tracing_reset_online_cpus(tr); bts_trace_start(tr); @@ -170,8 +173,9 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) return TRACE_TYPE_UNHANDLED; } -void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) +void trace_hw_branch(u64 from, u64 to) { + struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; unsigned long irq1, irq2; @@ -204,8 +208,7 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) local_irq_restore(irq1); } -static void trace_bts_at(struct trace_array *tr, - const struct bts_trace *trace, void *at) +static void trace_bts_at(const struct bts_trace *trace, void *at) { struct bts_struct bts; int err = 0; @@ -220,7 +223,7 @@ static void trace_bts_at(struct trace_array *tr, switch (bts.qualifier) { case BTS_BRANCH: - trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); + trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to); break; } } @@ -236,12 +239,15 @@ static void trace_bts_cpu(void *arg) const struct bts_trace *trace; unsigned char *at; - if (!this_tracer) + if (unlikely(!tr)) return; if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled))) return; + if (unlikely(!this_tracer)) + return; + ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) @@ -249,11 +255,11 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) - trace_bts_at(tr, trace, at); + trace_bts_at(trace, at); out: ds_resume_bts(this_tracer); @@ -268,6 +274,15 @@ static void trace_bts_prepare(struct trace_iterator *iter) mutex_unlock(&bts_tracer_mutex); } +void trace_hw_branch_oops(void) +{ + mutex_lock(&bts_tracer_mutex); + + trace_bts_cpu(hw_branch_trace); + + mutex_unlock(&bts_tracer_mutex); +} + struct tracer bts_tracer __read_mostly = { .name = "hw-branch-tracer", -- cgit v1.2.3-59-g8ed1b From e23b8ad83430a6fdfbdbfac365f5b0312dd57f10 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:33:31 +0100 Subject: x86, ftrace, hw-branch-tracer: reset trace buffer on close Reset the ftrace buffer on close. Since we use cyclic buffers, the trace is not contiguous, anyway. Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e56df2c7d679..372b47ac3154 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -274,6 +274,11 @@ static void trace_bts_prepare(struct trace_iterator *iter) mutex_unlock(&bts_tracer_mutex); } +static void trace_bts_close(struct trace_iterator *iter) +{ + tracing_reset_online_cpus(iter->tr); +} + void trace_hw_branch_oops(void) { mutex_lock(&bts_tracer_mutex); @@ -292,7 +297,8 @@ struct tracer bts_tracer __read_mostly = .print_line = bts_trace_print_line, .start = bts_trace_start, .stop = bts_trace_stop, - .open = trace_bts_prepare + .open = trace_bts_prepare, + .close = trace_bts_close }; __init static int init_bts_trace(void) -- cgit v1.2.3-59-g8ed1b From 11edda06289d412d13ff7c672bd72e043f637e74 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:29:16 +0100 Subject: x86, ftrace, hw-branch-tracer: change trace format Change the hw-branch-tracer format to be more readable. Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 372b47ac3154..fff3545fc866 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -146,10 +146,7 @@ static void bts_trace_reset(struct trace_array *tr) static void bts_trace_print_header(struct seq_file *m) { - seq_puts(m, - "# CPU# FROM TO FUNCTION\n"); - seq_puts(m, - "# | | | |\n"); + seq_puts(m, "# CPU# TO <- FROM\n"); } static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) @@ -157,15 +154,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) struct trace_entry *entry = iter->ent; struct trace_seq *seq = &iter->seq; struct hw_branch_entry *it; + unsigned long symflags = TRACE_ITER_SYM_OFFSET; trace_assign_type(it, entry); if (entry->type == TRACE_HW_BRANCHES) { if (trace_seq_printf(seq, "%4d ", entry->cpu) && - trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", - it->from, it->to) && - (!it->from || - seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) && + seq_print_ip_sym(seq, it->to, symflags) && + trace_seq_printf(seq, "\t <- ") && + seq_print_ip_sym(seq, it->from, symflags) && trace_seq_printf(seq, "\n")) return TRACE_TYPE_HANDLED; return TRACE_TYPE_PARTIAL_LINE;; -- cgit v1.2.3-59-g8ed1b From e2ea5399bb4fb7aaafb08f846db453f4eec55160 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:35:58 +0100 Subject: x86, ftrace, hw-branch-tracer: documentation Document the hw-branch-tracer in the ftrace documentation. Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 803b1318b13d..758fb42a1b68 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -165,6 +165,8 @@ Here is the list of current tracers that may be configured. nop - This is not a tracer. To remove all tracers from tracing simply echo "nop" into current_tracer. + hw-branch-tracer - traces branches on all cpu's in a circular buffer. + Examples of using the tracer ---------------------------- @@ -1152,6 +1154,78 @@ int main (int argc, char **argv) return 0; } + +hw-branch-tracer (x86 only) +--------------------------- + +This tracer uses the x86 last branch tracing hardware feature to +collect a branch trace on all cpus with relatively low overhead. + +The tracer uses a fixed-size circular buffer per cpu and only +traces ring 0 branches. The trace file dumps that buffer in the +following format: + +# tracer: hw-branch-tracer +# +# CPU# TO <- FROM + 0 scheduler_tick+0xb5/0x1bf <- task_tick_idle+0x5/0x6 + 2 run_posix_cpu_timers+0x2b/0x72a <- run_posix_cpu_timers+0x25/0x72a + 0 scheduler_tick+0x139/0x1bf <- scheduler_tick+0xed/0x1bf + 0 scheduler_tick+0x17c/0x1bf <- scheduler_tick+0x148/0x1bf + 2 run_posix_cpu_timers+0x9e/0x72a <- run_posix_cpu_timers+0x5e/0x72a + 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf + + +The tracer may be used to dump the trace for the oops'ing cpu on a +kernel oops into the system log. To enable this, ftrace_dump_on_oops +must be set. To set ftrace_dump_on_oops, one can either use the sysctl +function or set it via the proc system interface. + + sysctl kernel.ftrace_dump_on_oops=1 + +or + + echo 1 > /proc/sys/kernel/ftrace_dump_on_oops + + +Here's an example of such a dump after a null pointer dereference in a +kernel module: + +[57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 +[57848.106019] IP: [] open+0x6/0x14 [oops] +[57848.106019] PGD 2354e9067 PUD 2375e7067 PMD 0 +[57848.106019] Oops: 0002 [#1] SMP +[57848.106019] last sysfs file: /sys/devices/pci0000:00/0000:00:1e.0/0000:20:05.0/local_cpus +[57848.106019] Dumping ftrace buffer: +[57848.106019] --------------------------------- +[...] +[57848.106019] 0 chrdev_open+0xe6/0x165 <- cdev_put+0x23/0x24 +[57848.106019] 0 chrdev_open+0x117/0x165 <- chrdev_open+0xfa/0x165 +[57848.106019] 0 chrdev_open+0x120/0x165 <- chrdev_open+0x11c/0x165 +[57848.106019] 0 chrdev_open+0x134/0x165 <- chrdev_open+0x12b/0x165 +[57848.106019] 0 open+0x0/0x14 [oops] <- chrdev_open+0x144/0x165 +[57848.106019] 0 page_fault+0x0/0x30 <- open+0x6/0x14 [oops] +[57848.106019] 0 error_entry+0x0/0x5b <- page_fault+0x4/0x30 +[57848.106019] 0 error_kernelspace+0x0/0x31 <- error_entry+0x59/0x5b +[57848.106019] 0 error_sti+0x0/0x1 <- error_kernelspace+0x2d/0x31 +[57848.106019] 0 page_fault+0x9/0x30 <- error_sti+0x0/0x1 +[57848.106019] 0 do_page_fault+0x0/0x881 <- page_fault+0x1a/0x30 +[...] +[57848.106019] 0 do_page_fault+0x66b/0x881 <- is_prefetch+0x1ee/0x1f2 +[57848.106019] 0 do_page_fault+0x6e0/0x881 <- do_page_fault+0x67a/0x881 +[57848.106019] 0 oops_begin+0x0/0x96 <- do_page_fault+0x6e0/0x881 +[57848.106019] 0 trace_hw_branch_oops+0x0/0x2d <- oops_begin+0x9/0x96 +[...] +[57848.106019] 0 ds_suspend_bts+0x2a/0xe3 <- ds_suspend_bts+0x1a/0xe3 +[57848.106019] --------------------------------- +[57848.106019] CPU 0 +[57848.106019] Modules linked in: oops +[57848.106019] Pid: 5542, comm: cat Tainted: G W 2.6.28 #23 +[57848.106019] RIP: 0010:[] [] open+0x6/0x14 [oops] +[57848.106019] RSP: 0018:ffff880235457d48 EFLAGS: 00010246 +[...] + + dynamic ftrace -------------- -- cgit v1.2.3-59-g8ed1b From ce5e5540c0e839781e7cd134517d5d2e9e819636 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Mon, 19 Jan 2009 10:38:35 +0100 Subject: x86, ds, bts: cleanup DS configuration Cleanup the cpuid check for DS configuration. Signed-off-by: Markus Metzger Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index da91701a2348..169a120587be 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -15,8 +15,8 @@ * - buffer allocation (memory accounting) * * - * Copyright (C) 2007-2008 Intel Corporation. - * Markus Metzger , 2007-2008 + * Copyright (C) 2007-2009 Intel Corporation. + * Markus Metzger , 2007-2009 */ @@ -890,7 +890,7 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) } static const struct ds_configuration ds_cfg_netburst = { - .name = "netburst", + .name = "Netburst", .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), @@ -904,7 +904,7 @@ static const struct ds_configuration ds_cfg_netburst = { #endif }; static const struct ds_configuration ds_cfg_pentium_m = { - .name = "pentium m", + .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .sizeof_field = sizeof(long), @@ -915,8 +915,8 @@ static const struct ds_configuration ds_cfg_pentium_m = { .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; -static const struct ds_configuration ds_cfg_core2 = { - .name = "core 2", +static const struct ds_configuration ds_cfg_core2_atom = { + .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), @@ -949,19 +949,22 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { - case 0 ... 0xC: - /* sorry, don't know about them */ - break; - case 0xD: - case 0xE: /* Pentium M */ + case 0x9: + case 0xd: /* Pentium M */ ds_configure(&ds_cfg_pentium_m); break; - default: /* Core2, Atom, ... */ - ds_configure(&ds_cfg_core2); + case 0xf: + case 0x17: /* Core2 */ + case 0x1c: /* Atom */ + ds_configure(&ds_cfg_core2_atom); + break; + case 0x1a: /* i7 */ + default: + /* sorry, don't know about them */ break; } break; - case 0xF: + case 0xf: switch (c->x86_model) { case 0x0: case 0x1: -- cgit v1.2.3-59-g8ed1b From 3690b5e6fd9daa030039ae9bda69044228bd476d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 16 Jan 2009 16:32:25 +0800 Subject: trace_workqueue: use percpu data for workqueue stat Impact: use percpu data instead of a global structure Use: static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); instead of allocating a global structure. percpu data also works well on NUMA. Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 64 +++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index f8118d39ca9b..4664990fe9c5 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -8,6 +8,7 @@ #include #include +#include #include "trace_stat.h" #include "trace.h" @@ -37,7 +38,8 @@ struct workqueue_global_stats { /* Don't need a global lock because allocated before the workqueues, and * never freed. */ -static struct workqueue_global_stats *all_workqueue_stat; +static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); +#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) /* Insertion of a work */ static void @@ -48,8 +50,8 @@ probe_workqueue_insertion(struct task_struct *wq_thread, struct cpu_workqueue_stats *node, *next; unsigned long flags; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { atomic_inc(&node->inserted); @@ -58,7 +60,7 @@ probe_workqueue_insertion(struct task_struct *wq_thread, } pr_debug("trace_workqueue: entry not found\n"); found: - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } /* Execution of a work */ @@ -70,8 +72,8 @@ probe_workqueue_execution(struct task_struct *wq_thread, struct cpu_workqueue_stats *node, *next; unsigned long flags; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { node->executed++; @@ -80,7 +82,7 @@ probe_workqueue_execution(struct task_struct *wq_thread, } pr_debug("trace_workqueue: entry not found\n"); found: - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } /* Creation of a cpu workqueue thread */ @@ -104,11 +106,11 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) cws->pid = wq_thread->pid; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - if (list_empty(&all_workqueue_stat[cpu].list)) + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (list_empty(&workqueue_cpu_stat(cpu)->list)) cws->first_entry = true; - list_add_tail(&cws->list, &all_workqueue_stat[cpu].list); - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } /* Destruction of a cpu workqueue thread */ @@ -119,8 +121,8 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread) struct cpu_workqueue_stats *node, *next; unsigned long flags; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list, + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { list_del(&node->list); @@ -131,7 +133,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread) pr_debug("trace_workqueue: don't find workqueue to destroy\n"); found: - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } @@ -141,13 +143,13 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) struct cpu_workqueue_stats *ret = NULL; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (!list_empty(&all_workqueue_stat[cpu].list)) - ret = list_entry(all_workqueue_stat[cpu].list.next, + if (!list_empty(&workqueue_cpu_stat(cpu)->list)) + ret = list_entry(workqueue_cpu_stat(cpu)->list.next, struct cpu_workqueue_stats, list); - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); return ret; } @@ -172,9 +174,9 @@ static void *workqueue_stat_next(void *prev, int idx) unsigned long flags; void *ret = NULL; - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - if (list_is_last(&prev_cws->list, &all_workqueue_stat[cpu].list)) { - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); for (++cpu ; cpu < num_possible_cpus(); cpu++) { ret = workqueue_stat_start_cpu(cpu); if (ret) @@ -182,7 +184,7 @@ static void *workqueue_stat_next(void *prev, int idx) } return NULL; } - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, list); @@ -199,10 +201,10 @@ static int workqueue_stat_show(struct seq_file *s, void *p) cws->executed, trace_find_cmdline(cws->pid)); - spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags); - if (&cws->list == all_workqueue_stat[cpu].list.next) + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (&cws->list == workqueue_cpu_stat(cpu)->list.next) seq_printf(s, "\n"); - spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); return 0; } @@ -258,17 +260,9 @@ int __init trace_workqueue_early_init(void) if (ret) goto no_creation; - all_workqueue_stat = kmalloc(sizeof(struct workqueue_global_stats) - * num_possible_cpus(), GFP_KERNEL); - - if (!all_workqueue_stat) { - pr_warning("trace_workqueue: not enough memory\n"); - goto no_creation; - } - for_each_possible_cpu(cpu) { - spin_lock_init(&all_workqueue_stat[cpu].lock); - INIT_LIST_HEAD(&all_workqueue_stat[cpu].list); + spin_lock_init(&workqueue_cpu_stat(cpu)->lock); + INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); } return 0; -- cgit v1.2.3-59-g8ed1b From 33ad965dde197a016083438359c3163e1aca9ada Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 21 Jan 2009 15:22:17 +1100 Subject: Long btree pointers are still 64 bit on disk [XFS] Long btree pointers are still 64 bit on disk On 32 bit machines with CONFIG_LBD=n, XFS reduces the in memory size of xfs_fsblock_t to 32 bits so that it will fit within 32 bit addressing. However, the disk format for long btree pointers are still 64 bits in size. The recent btree rewrite failed to take this into account when initialising new btree blocks, setting sibling pointers to NULL and checking if they are NULL. Hence checking whether a 64 bit NULL was the same as a 32 bit NULL was failingi resulting in NULL sibling pointers failing to be detected correctly. This showed up as WANT_CORRUPTED_GOTO shutdowns in xfs_btree_delrec. Fix this by making all the comparisons and setting of long pointer btree NULL blocks to the disk format, not the in memory format. i.e. use NULLDFSBNO. Reported-by: Alexander Beregalov Reported-by: Jacek Luczak Reported-by: Danny ter Haar Tested-by: Jacek Luczak Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner Signed-off-by: Felix Blyakher --- fs/xfs/xfs_btree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 4681519ded91..e73c332eb23f 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -843,7 +843,7 @@ xfs_btree_ptr_is_null( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - return be64_to_cpu(ptr->l) == NULLFSBLOCK; + return be64_to_cpu(ptr->l) == NULLDFSBNO; else return be32_to_cpu(ptr->s) == NULLAGBLOCK; } @@ -854,7 +854,7 @@ xfs_btree_set_ptr_null( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - ptr->l = cpu_to_be64(NULLFSBLOCK); + ptr->l = cpu_to_be64(NULLDFSBNO); else ptr->s = cpu_to_be32(NULLAGBLOCK); } @@ -918,8 +918,8 @@ xfs_btree_init_block( new->bb_numrecs = cpu_to_be16(numrecs); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); - new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); + new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); } else { new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); @@ -971,7 +971,7 @@ xfs_btree_ptr_to_daddr( union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK); + ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO); return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); } else { -- cgit v1.2.3-59-g8ed1b From 5253a11a8187b6329dda41219d9eece971c06e19 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 1 Jan 2009 16:50:27 -0600 Subject: [XFS] remove always-true #ifndef HAVE_FORMAT32 tests There are several tests for #ifndef HAVE_FORMAT32, but this is never defined anywhere so it is always the default behavior; just remove the ifndef goop. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- fs/xfs/xfs_extfree_item.h | 6 ------ fs/xfs/xfs_inode_item.h | 2 -- 2 files changed, 8 deletions(-) diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 2f049f63e85f..0d22c56fdf64 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -33,12 +33,10 @@ typedef struct xfs_extent { * conversion routine. */ -#ifndef HAVE_FORMAT32 typedef struct xfs_extent_32 { __uint64_t ext_start; __uint32_t ext_len; } __attribute__((packed)) xfs_extent_32_t; -#endif typedef struct xfs_extent_64 { __uint64_t ext_start; @@ -59,7 +57,6 @@ typedef struct xfs_efi_log_format { xfs_extent_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_t; -#ifndef HAVE_FORMAT32 typedef struct xfs_efi_log_format_32 { __uint16_t efi_type; /* efi log item type */ __uint16_t efi_size; /* size of this item */ @@ -67,7 +64,6 @@ typedef struct xfs_efi_log_format_32 { __uint64_t efi_id; /* efi identifier */ xfs_extent_32_t efi_extents[1]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; -#endif typedef struct xfs_efi_log_format_64 { __uint16_t efi_type; /* efi log item type */ @@ -90,7 +86,6 @@ typedef struct xfs_efd_log_format { xfs_extent_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_t; -#ifndef HAVE_FORMAT32 typedef struct xfs_efd_log_format_32 { __uint16_t efd_type; /* efd log item type */ __uint16_t efd_size; /* size of this item */ @@ -98,7 +93,6 @@ typedef struct xfs_efd_log_format_32 { __uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_32_t efd_extents[1]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; -#endif typedef struct xfs_efd_log_format_64 { __uint16_t efd_type; /* efd log item type */ diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 9957d0602d54..a52ac125f055 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -40,7 +40,6 @@ typedef struct xfs_inode_log_format { __int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_t; -#ifndef HAVE_FORMAT32 typedef struct xfs_inode_log_format_32 { __uint16_t ilf_type; /* inode log item type */ __uint16_t ilf_size; /* size of this item */ @@ -56,7 +55,6 @@ typedef struct xfs_inode_log_format_32 { __int32_t ilf_len; /* len of inode buffer */ __int32_t ilf_boffset; /* off of inode in buffer */ } __attribute__((packed)) xfs_inode_log_format_32_t; -#endif typedef struct xfs_inode_log_format_64 { __uint16_t ilf_type; /* inode log item type */ -- cgit v1.2.3-59-g8ed1b From 5bc4564b224c3d9fe6dddafa25f56059bd978231 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 14:36:52 -0500 Subject: trace: do not disable wake up tracer on output of trace Impact: fix to erased trace output To try not to have the outputing of a trace interfere with the wakeup tracer, it would disable tracing while the output was printing. But if a trace had started when it was disabled, it can show a partial trace. To try to solve this, on closing of the tracer, it would clear the trace buffer. The latency tracers (wakeup and irqsoff) have two buffers. One for recording and one for holding the max trace that is printed. The clearing of the trace above should only affect the recording buffer. But for some reason it would move the erased trace to the print buffer. Probably due to a race with the closing of the trace and the saving ofhe max race. The above is all pretty useless, and if the user does not want the printing of the trace to be traced itself, then the user can manual disable tracing. This patch removes all the code that tries to keep the output of the tracer from modifying the trace. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 42ae1e77b6b3..e27adef0171a 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -262,12 +262,6 @@ out: atomic_dec(&wakeup_trace->data[cpu]->disabled); } -/* - * save_tracer_enabled is used to save the state of the tracer_enabled - * variable when we disable it when we open a trace output file. - */ -static int save_tracer_enabled; - static void start_wakeup_tracer(struct trace_array *tr) { int ret; @@ -306,13 +300,10 @@ static void start_wakeup_tracer(struct trace_array *tr) register_ftrace_function(&trace_ops); - if (tracing_is_enabled()) { + if (tracing_is_enabled()) tracer_enabled = 1; - save_tracer_enabled = 1; - } else { + else tracer_enabled = 0; - save_tracer_enabled = 0; - } return; fail_deprobe_wake_new: @@ -324,7 +315,6 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); unregister_trace_sched_switch(probe_wakeup_sched_switch); unregister_trace_sched_wakeup_new(probe_wakeup); @@ -350,28 +340,11 @@ static void wakeup_tracer_start(struct trace_array *tr) { wakeup_reset(tr); tracer_enabled = 1; - save_tracer_enabled = 1; } static void wakeup_tracer_stop(struct trace_array *tr) { tracer_enabled = 0; - save_tracer_enabled = 0; -} - -static void wakeup_tracer_open(struct trace_iterator *iter) -{ - /* stop the trace while dumping */ - tracer_enabled = 0; -} - -static void wakeup_tracer_close(struct trace_iterator *iter) -{ - /* forget about any processes we were recording */ - if (save_tracer_enabled) { - wakeup_reset(iter->tr); - tracer_enabled = 1; - } } static struct tracer wakeup_tracer __read_mostly = @@ -381,8 +354,6 @@ static struct tracer wakeup_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, - .open = wakeup_tracer_open, - .close = wakeup_tracer_close, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, -- cgit v1.2.3-59-g8ed1b From 97b17efe4537e11bf6669106cfe4ee2c5331b267 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 15:24:56 -0500 Subject: ring-buffer: do not swap if recording is disabled If the ring buffer recording has been disabled. Do not let swapping of ring buffers occur. Simply return -EAGAIN. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 15 +++++++++++++++ kernel/trace/trace.c | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0b9de5a3d699..890020e28a35 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2266,9 +2266,24 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, if (buffer_a->pages != buffer_b->pages) return -EINVAL; + if (ring_buffer_flags != RB_BUFFERS_ON) + return -EAGAIN; + + if (atomic_read(&buffer_a->record_disabled)) + return -EAGAIN; + + if (atomic_read(&buffer_b->record_disabled)) + return -EAGAIN; + cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; + if (atomic_read(&cpu_buffer_a->record_disabled)) + return -EAGAIN; + + if (atomic_read(&cpu_buffer_b->record_disabled)) + return -EAGAIN; + /* * We can't do a synchronize_sched here because this * function can be called in atomic context. diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 220c264e3111..757ae6f7e648 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -415,7 +415,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) ftrace_enable_cpu(); - WARN_ON_ONCE(ret); + WARN_ON_ONCE(ret && ret != -EAGAIN); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); -- cgit v1.2.3-59-g8ed1b From 3244351c31211a8b1ba8b4b34c3de04d5dfa03e4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 16:24:46 -0500 Subject: trace: separate out rt tasks from wakeup tracer Impact: add option to trace all tasks or just RT tasks The current wakeup tracer only traces RT task wakeups. This is fine for those interested in wake up timings of RT tasks, but it is useless for those that are interested in the causes of long wakeups for non RT tasks. This patch creates a "wakeup_rt" to implement the tracing of just RT tasks (as the current "wakeup" does). And makes "wakeup" now trace all tasks as an average developer would expect. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e27adef0171a..f48957886102 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -25,6 +25,7 @@ static int __read_mostly tracer_enabled; static struct task_struct *wakeup_task; static int wakeup_cpu; static unsigned wakeup_prio = -1; +static int wakeup_rt; static raw_spinlock_t wakeup_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -224,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) tracing_record_cmdline(p); tracing_record_cmdline(current); - if (likely(!rt_task(p)) || + if ((wakeup_rt && !rt_task(p)) || p->prio >= wakeup_prio || p->prio >= current->prio) return; @@ -321,7 +322,7 @@ static void stop_wakeup_tracer(struct trace_array *tr) unregister_trace_sched_wakeup(probe_wakeup); } -static int wakeup_tracer_init(struct trace_array *tr) +static int __wakeup_tracer_init(struct trace_array *tr) { tracing_max_latency = 0; wakeup_trace = tr; @@ -329,6 +330,18 @@ static int wakeup_tracer_init(struct trace_array *tr) return 0; } +static int wakeup_tracer_init(struct trace_array *tr) +{ + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + +static int wakeup_rt_tracer_init(struct trace_array *tr) +{ + wakeup_rt = 1; + return __wakeup_tracer_init(tr); +} + static void wakeup_tracer_reset(struct trace_array *tr) { stop_wakeup_tracer(tr); @@ -360,6 +373,19 @@ static struct tracer wakeup_tracer __read_mostly = #endif }; +static struct tracer wakeup_rt_tracer __read_mostly = +{ + .name = "wakeup_rt", + .init = wakeup_rt_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .print_max = 1, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif +}; + __init static int init_wakeup_tracer(void) { int ret; @@ -368,6 +394,10 @@ __init static int init_wakeup_tracer(void) if (ret) return ret; + ret = register_tracer(&wakeup_rt_tracer); + if (ret) + return ret; + return 0; } device_initcall(init_wakeup_tracer); -- cgit v1.2.3-59-g8ed1b From f8ec1062f589cdb1cffcffab1376124a1bc08500 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 17:17:04 -0500 Subject: wakeup-tracer: show scheduling data in output Impact: better data for wakeup tracer This patch adds the wakeup and schedule calls that are used by the scheduler tracer to make the wakeup tracer more readable. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f48957886102..93cecda650b2 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -153,6 +153,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, goto out_unlock; trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + tracing_sched_switch_trace(wakeup_trace, data, prev, next, flags, pc); /* * usecs conversion is slow so we try to delay the conversion @@ -214,6 +215,7 @@ static void wakeup_reset(struct trace_array *tr) static void probe_wakeup(struct rq *rq, struct task_struct *p, int success) { + struct trace_array_cpu *data; int cpu = smp_processor_id(); unsigned long flags; long disabled; @@ -253,9 +255,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) local_save_flags(flags); - wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); - trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], - CALLER_ADDR1, CALLER_ADDR2, flags, pc); + data = wakeup_trace->data[wakeup_cpu]; + data->preempt_timestamp = ftrace_now(cpu); + tracing_sched_wakeup_trace(wakeup_trace, data, p, current, + flags, pc); + trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, + flags, pc); out_locked: __raw_spin_unlock(&wakeup_lock); -- cgit v1.2.3-59-g8ed1b From 69507c06539332e6e49f83aa478844130233bece Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Jan 2009 18:45:57 -0500 Subject: ring-buffer: reset timestamps when ring buffer is reset Impact: fix bad times of recent resets The ring buffer needs to reset its timestamps when reseting of the buffer, otherwise the timestamps are stale and might be used to calculate times in the buffer causing funny timestamps to appear. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 890020e28a35..7839280ffcd8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2166,6 +2166,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->overrun = 0; cpu_buffer->entries = 0; + + cpu_buffer->write_stamp = 0; + cpu_buffer->read_stamp = 0; } /** -- cgit v1.2.3-59-g8ed1b From 4c50d9ea9ca9e46b65aeffed3e0d6f54ff38c8d4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 22 Jan 2009 14:14:14 +0100 Subject: cpumask: modifiy oprofile initialization Delta patch to f7df8ed164996cd2c6aca9674388be6ef78d8b37 for tip/cpus4096. Moved initialization to sync_start()/sync_stop(). No changes needed in buffer_sync.h and oprof.c anymore. Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- drivers/oprofile/buffer_sync.c | 20 ++++++-------------- drivers/oprofile/buffer_sync.h | 4 ---- drivers/oprofile/oprof.c | 9 +-------- 3 files changed, 7 insertions(+), 26 deletions(-) diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index c3ea5fa7d05a..8574622e36a5 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -154,6 +154,10 @@ int sync_start(void) { int err; + if (!alloc_cpumask_var(&marked_cpus, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(marked_cpus); + start_cpu_work(); err = task_handoff_register(&task_free_nb); @@ -179,6 +183,7 @@ out2: task_handoff_unregister(&task_free_nb); out1: end_sync(); + free_cpumask_var(marked_cpus); goto out; } @@ -190,6 +195,7 @@ void sync_stop(void) profile_event_unregister(PROFILE_TASK_EXIT, &task_exit_nb); task_handoff_unregister(&task_free_nb); end_sync(); + free_cpumask_var(marked_cpus); } @@ -565,20 +571,6 @@ void sync_buffer(int cpu) mutex_unlock(&buffer_mutex); } -int __init buffer_sync_init(void) -{ - if (!alloc_cpumask_var(&marked_cpus, GFP_KERNEL)) - return -ENOMEM; - - cpumask_clear(marked_cpus); - return 0; -} - -void __exit buffer_sync_cleanup(void) -{ - free_cpumask_var(marked_cpus); -} - /* The function can be used to add a buffer worth of data directly to * the kernel buffer. The buffer is assumed to be a circular buffer. * Take the entries from index start and end at index end, wrapping diff --git a/drivers/oprofile/buffer_sync.h b/drivers/oprofile/buffer_sync.h index 0ebf5db62679..3110732c1835 100644 --- a/drivers/oprofile/buffer_sync.h +++ b/drivers/oprofile/buffer_sync.h @@ -19,8 +19,4 @@ void sync_stop(void); /* sync the given CPU's buffer */ void sync_buffer(int cpu); -/* initialize/destroy the buffer system. */ -int buffer_sync_init(void); -void buffer_sync_cleanup(void); - #endif /* OPROFILE_BUFFER_SYNC_H */ diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index ced39f602292..3cffce90f82a 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -183,10 +183,6 @@ static int __init oprofile_init(void) { int err; - err = buffer_sync_init(); - if (err) - return err; - err = oprofile_arch_init(&oprofile_ops); if (err < 0 || timer) { @@ -195,10 +191,8 @@ static int __init oprofile_init(void) } err = oprofilefs_register(); - if (err) { + if (err) oprofile_arch_exit(); - buffer_sync_cleanup(); - } return err; } @@ -208,7 +202,6 @@ static void __exit oprofile_exit(void) { oprofilefs_unregister(); oprofile_arch_exit(); - buffer_sync_cleanup(); } -- cgit v1.2.3-59-g8ed1b From 94523e818f283d3c69f621406f633afff46dbf82 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 11:18:06 -0500 Subject: trace: remove internal irqsoff disabling for trace output Impact: cleanup of duplicate features The trace output disables the ring buffer and prevents tracing to occur. The code in irqsoff to do the same thing is no longer needed. This patch removes it. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_irqsoff.c | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 62a78d943534..ed344b022a14 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -353,28 +353,18 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -/* - * save_tracer_enabled is used to save the state of the tracer_enabled - * variable when we disable it when we open a trace output file. - */ -static int save_tracer_enabled; - static void start_irqsoff_tracer(struct trace_array *tr) { register_ftrace_function(&trace_ops); - if (tracing_is_enabled()) { + if (tracing_is_enabled()) tracer_enabled = 1; - save_tracer_enabled = 1; - } else { + else tracer_enabled = 0; - save_tracer_enabled = 0; - } } static void stop_irqsoff_tracer(struct trace_array *tr) { tracer_enabled = 0; - save_tracer_enabled = 0; unregister_ftrace_function(&trace_ops); } @@ -395,25 +385,11 @@ static void irqsoff_tracer_reset(struct trace_array *tr) static void irqsoff_tracer_start(struct trace_array *tr) { tracer_enabled = 1; - save_tracer_enabled = 1; } static void irqsoff_tracer_stop(struct trace_array *tr) { tracer_enabled = 0; - save_tracer_enabled = 0; -} - -static void irqsoff_tracer_open(struct trace_iterator *iter) -{ - /* stop the trace while dumping */ - tracer_enabled = 0; -} - -static void irqsoff_tracer_close(struct trace_iterator *iter) -{ - /* restart tracing */ - tracer_enabled = save_tracer_enabled; } #ifdef CONFIG_IRQSOFF_TRACER @@ -431,8 +407,6 @@ static struct tracer irqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, @@ -459,8 +433,6 @@ static struct tracer preemptoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, @@ -489,8 +461,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .reset = irqsoff_tracer_reset, .start = irqsoff_tracer_start, .stop = irqsoff_tracer_stop, - .open = irqsoff_tracer_open, - .close = irqsoff_tracer_close, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, -- cgit v1.2.3-59-g8ed1b From b06a830183b610c0a88c29a92feb7991a867ab46 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 14:26:15 -0500 Subject: trace: fix logic to start/stop counting The logic in the tracing_start/stop code prevents the WARN_ON from ever detecting if a start/stop pair was mismatched. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 757ae6f7e648..2129ab9d2a48 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -610,13 +610,12 @@ void tracing_start(void) return; spin_lock_irqsave(&tracing_start_lock, flags); - if (--trace_stop_count) - goto out; - - if (trace_stop_count < 0) { - /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - trace_stop_count = 0; + if (--trace_stop_count) { + if (trace_stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + trace_stop_count = 0; + } goto out; } -- cgit v1.2.3-59-g8ed1b From 7e49fcce1bdadd723ae6a0b3b324c4daced61563 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 Jan 2009 19:01:40 -0500 Subject: trace, lockdep: manual preempt count adding for local_bh_disable Impact: fix to preempt trace triggering lockdep check_flag failure In local_bh_disable, the use of add_preempt_count causes the preempt tracer to start recording the time preemption is off. But because it already modified the preempt_count to show softirqs disabled, and before it called the lockdep code to handle this, it causes a state that lockdep can not handle. The preempt tracer will reset the ring buffer on start of a trace, and the ring buffer reset code does a spin_lock_irqsave. This calls into lockdep and lockdep will fail when it detects the invalid state of having softirqs disabled but the internal current->softirqs_enabled is still set. The fix is to manually add the SOFTIRQ_OFFSET to preempt count and call the preempt tracer code outside the lockdep critical area. Thanks to Peter Zijlstra for suggesting this solution. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 8 ++++---- kernel/softirq.c | 13 ++++++++++++- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f8..33085b88f87b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern unsigned long get_parent_ip(unsigned long addr); + struct seq_file; struct cfs_rq; struct task_group; diff --git a/kernel/sched.c b/kernel/sched.c index 52bbf1c842a8..c154825ae753 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4399,10 +4399,7 @@ void scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ - defined(CONFIG_PREEMPT_TRACER)) - -static inline unsigned long get_parent_ip(unsigned long addr) +unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { addr = CALLER_ADDR2; @@ -4412,6 +4409,9 @@ static inline unsigned long get_parent_ip(unsigned long addr) return addr; } +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + void __kprobes add_preempt_count(int val) { #ifdef CONFIG_DEBUG_PREEMPT diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..6edfc2c11d99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -79,13 +80,23 @@ static void __local_bh_disable(unsigned long ip) WARN_ON_ONCE(in_irq()); raw_local_irq_save(flags); - add_preempt_count(SOFTIRQ_OFFSET); + /* + * The preempt tracer hooks into add_preempt_count and will break + * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET + * is set and before current->softirq_enabled is cleared. + * We must manually increment preempt_count here and manually + * call the trace_preempt_off later. + */ + preempt_count() += SOFTIRQ_OFFSET; /* * Were softirqs turned off above: */ if (softirq_count() == SOFTIRQ_OFFSET) trace_softirqs_off(ip); raw_local_irq_restore(flags); + + if (preempt_count() == SOFTIRQ_OFFSET) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } #else /* !CONFIG_TRACE_IRQFLAGS */ static inline void __local_bh_disable(unsigned long ip) -- cgit v1.2.3-59-g8ed1b From 9005f3ebebfcfe9ccd731d16c468907a35ac1f9a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 22 Jan 2009 17:04:53 -0800 Subject: tracing/function-graph-tracer: various fixes and features This patch brings various bugfixes: - Drop the first irrelevant task switch on the very beginning of a trace. - Drop the OVERHEAD word from the headers, the DURATION word is sufficient and will not overlap other columns. - Make the headers fit well their respective columns whatever the selected options. Ie, default options: # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 1) 0.646 us | } 1) | mem_cgroup_del_lru_list() { 1) 0.624 us | lookup_page_cgroup(); 1) 1.970 us | } echo funcgraph-proc > trace_options # tracer: function_graph # # CPU TASK/PID DURATION FUNCTION CALLS # | | | | | | | | | 0) bash-2937 | 0.895 us | } 0) bash-2937 | 0.888 us | __rcu_read_unlock(); 0) bash-2937 | 0.864 us | conv_uni_to_pc(); 0) bash-2937 | 1.015 us | __rcu_read_lock(); echo nofuncgraph-cpu > trace_options echo nofuncgraph-proc > trace_options # tracer: function_graph # # DURATION FUNCTION CALLS # | | | | | | 3.752 us | native_pud_val(); 0.616 us | native_pud_val(); 0.624 us | native_pmd_val(); About features, one can now disable the duration (this will hide the overhead too for convenient reasons and because on doesn't need overhead if it hasn't the duration): echo nofuncgraph-duration > trace_options # tracer: function_graph # # FUNCTION CALLS # | | | | cap_vm_enough_memory() { __vm_enough_memory() { vm_acct_memory(); } } } And at last, an option to print the absolute time: //Restart from default options echo funcgraph-abstime > trace_options # tracer: function_graph # # TIME CPU DURATION FUNCTION CALLS # | | | | | | | | 261.339774 | 1) + 42.823 us | } 261.339775 | 1) 1.045 us | _spin_lock_irq(); 261.339777 | 1) 0.940 us | _spin_lock_irqsave(); 261.339778 | 1) 0.752 us | _spin_unlock_irqrestore(); 261.339780 | 1) 0.857 us | _spin_unlock_irq(); 261.339782 | 1) | flush_to_ldisc() { 261.339783 | 1) | tty_ldisc_ref() { 261.339783 | 1) | tty_ldisc_try() { 261.339784 | 1) 1.075 us | _spin_lock_irqsave(); 261.339786 | 1) 0.842 us | _spin_unlock_irqrestore(); 261.339788 | 1) 4.211 us | } 261.339788 | 1) 5.662 us | } The format is seconds.usecs. I guess no one needs the nanosec precision here, the main goal is to have an overview about the general timings of events, and to see the place when the trace switches from one cpu to another. ie: 274.874760 | 1) 0.676 us | _spin_unlock(); 274.874762 | 1) 0.609 us | native_load_sp0(); 274.874763 | 1) 0.602 us | native_load_tls(); 274.878739 | 0) 0.722 us | } 274.878740 | 0) 0.714 us | native_pmd_val(); 274.878741 | 0) 0.730 us | native_pmd_val(); Here there is a 4000 usecs difference when we switch the cpu. Changes in V2: - Completely fix the first pointless task switch. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 298 +++++++++++++++++++++++------------ 1 file changed, 195 insertions(+), 103 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3c545984816f..66fc7b806a38 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1,7 +1,7 @@ /* * * Function graph tracer. - * Copyright (c) 2008 Frederic Weisbecker + * Copyright (c) 2008-2009 Frederic Weisbecker * Mostly borrowed from function tracer which * is Copyright (c) Steven Rostedt * @@ -21,9 +21,11 @@ #define TRACE_GRAPH_PRINT_CPU 0x2 #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 #define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 static struct tracer_opt trace_opts[] = { - /* Display overruns ? */ + /* Display overruns? (for self-debug purpose) */ { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, /* Display CPU ? */ { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, @@ -31,17 +33,22 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, /* Display proc name/pid */ { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, + /* Display duration of execution */ + { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, + /* Display absolute time of an entry */ + { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns and proc by default */ - .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | + TRACE_GRAPH_PRINT_DURATION, .opts = trace_opts }; /* pid on the last trace processed */ -static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; + static int graph_trace_init(struct trace_array *tr) { @@ -154,17 +161,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* If the pid changed since the last trace, output this event */ static enum print_line_t -verif_pid(struct trace_seq *s, pid_t pid, int cpu) +verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) { pid_t prev_pid; + pid_t *last_pid; int ret; - if (last_pid[cpu] != -1 && last_pid[cpu] == pid) + if (!last_pids_cpu) + return TRACE_TYPE_HANDLED; + + last_pid = per_cpu_ptr(last_pids_cpu, cpu); + + if (*last_pid == pid) return TRACE_TYPE_HANDLED; - prev_pid = last_pid[cpu]; - last_pid[cpu] = pid; + prev_pid = *last_pid; + *last_pid = pid; + if (prev_pid == -1) + return TRACE_TYPE_HANDLED; /* * Context-switch trace line: @@ -232,9 +247,34 @@ trace_branch_is_leaf(struct trace_iterator *iter, return true; } +/* Signal a overhead of time execution to the output */ +static int +print_graph_overhead(unsigned long long duration, struct trace_seq *s) +{ + /* If duration disappear, we don't need anything */ + if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) + return 1; + + /* Non nested entry or return */ + if (duration == -1) + return trace_seq_printf(s, " "); + + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + return trace_seq_printf(s, "! "); + + /* Duration exceeded 10 msecs */ + if (duration > 10000ULL) + return trace_seq_printf(s, "+ "); + } + + return trace_seq_printf(s, " "); +} + static enum print_line_t print_graph_irq(struct trace_seq *s, unsigned long addr, - enum trace_type type, int cpu, pid_t pid) + enum trace_type type, int cpu, pid_t pid) { int ret; @@ -242,35 +282,40 @@ print_graph_irq(struct trace_seq *s, unsigned long addr, addr >= (unsigned long)__irqentry_text_end) return TRACE_TYPE_UNHANDLED; - if (type == TRACE_GRAPH_ENT) { - ret = trace_seq_printf(s, "==========> | "); - } else { - /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + /* Cpu */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Proc */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + /* No overhead */ + ret = print_graph_overhead(-1, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - /* No overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (type == TRACE_GRAPH_ENT) + ret = trace_seq_printf(s, "==========>"); + else + ret = trace_seq_printf(s, "<=========="); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Don't close the duration column if haven't one */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + trace_seq_printf(s, " |"); + ret = trace_seq_printf(s, "\n"); - ret = trace_seq_printf(s, "<========== |\n"); - } if (!ret) return TRACE_TYPE_PARTIAL_LINE; return TRACE_TYPE_HANDLED; @@ -289,7 +334,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) sprintf(msecs_str, "%lu", (unsigned long) duration); /* Print msecs */ - ret = trace_seq_printf(s, msecs_str); + ret = trace_seq_printf(s, "%s", msecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -322,19 +367,15 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) } -/* Signal a overhead of time execution to the output */ -static int -print_graph_overhead(unsigned long long duration, struct trace_seq *s) +static int print_graph_abs_time(u64 t, struct trace_seq *s) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - return trace_seq_printf(s, "! "); + unsigned long usecs_rem; - /* Duration exceeded 10 msecs */ - if (duration > 10000ULL) - return trace_seq_printf(s, "+ "); + usecs_rem = do_div(t, 1000000000); + usecs_rem /= 1000; - return trace_seq_printf(s, " "); + return trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); } /* Case of a leaf function on its call entry */ @@ -357,16 +398,16 @@ print_graph_entry_leaf(struct trace_iterator *iter, duration = graph_ret->rettime - graph_ret->calltime; /* Overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = print_graph_overhead(duration, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { @@ -395,25 +436,17 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ent *call = &entry->graph_ent; /* No overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_overhead(-1, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; - /* Interrupt */ - ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); - if (ret == TRACE_TYPE_UNHANDLED) { - /* No time */ + /* No time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - } else { - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; } - /* Function */ for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); @@ -434,15 +467,30 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter, int cpu) + struct trace_iterator *iter) { int ret; + int cpu = iter->cpu; + pid_t *last_entry = iter->private; struct trace_entry *ent = iter->ent; + struct ftrace_graph_ent *call = &field->graph_ent; /* Pid */ - if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + /* Interrupt */ + ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); @@ -470,16 +518,25 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, - struct trace_entry *ent, int cpu) + struct trace_entry *ent, struct trace_iterator *iter) { int i; int ret; + int cpu = iter->cpu; + pid_t *last_pid = iter->private; unsigned long long duration = trace->rettime - trace->calltime; /* Pid */ - if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); @@ -499,16 +556,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } /* Overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = print_graph_overhead(duration, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + ret = print_graph_overhead(duration, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; /* Duration */ - ret = print_graph_duration(duration, s); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } /* Closing brace */ for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { @@ -542,14 +599,23 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, { int i; int ret; + int cpu = iter->cpu; + pid_t *last_pid = iter->private; + + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Pid */ - if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, iter->cpu); + ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; } @@ -566,17 +632,17 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, } /* No overhead */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - ret = trace_seq_printf(s, " "); + ret = print_graph_overhead(-1, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* No time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { + ret = trace_seq_printf(s, " | "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } - /* No time */ - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - /* Indentation */ if (trace->depth > 0) for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { @@ -614,13 +680,12 @@ print_graph_function(struct trace_iterator *iter) case TRACE_GRAPH_ENT: { struct ftrace_graph_ent_entry *field; trace_assign_type(field, entry); - return print_graph_entry(field, s, iter, - iter->cpu); + return print_graph_entry(field, s, iter); } case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter->cpu); + return print_graph_return(&field->ret, s, entry, iter); } case TRACE_PRINT: { struct print_entry *field; @@ -636,28 +701,55 @@ static void print_graph_headers(struct seq_file *s) { /* 1st line */ seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + seq_printf(s, " TIME "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "CPU "); + seq_printf(s, "CPU"); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, "TASK/PID "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) - seq_printf(s, "OVERHEAD/"); - seq_printf(s, "DURATION FUNCTION CALLS\n"); + seq_printf(s, " TASK/PID "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + seq_printf(s, " DURATION "); + seq_printf(s, " FUNCTION CALLS\n"); /* 2nd line */ seq_printf(s, "# "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) + seq_printf(s, " | "); if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, "| "); + seq_printf(s, "| "); if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, "| | "); - if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { - seq_printf(s, "| "); - seq_printf(s, "| | | | |\n"); - } else - seq_printf(s, " | | | | |\n"); + seq_printf(s, " | | "); + if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) + seq_printf(s, " | | "); + seq_printf(s, " | | | |\n"); } + +static void graph_trace_open(struct trace_iterator *iter) +{ + /* pid on the last trace processed */ + pid_t *last_pid = alloc_percpu(pid_t); + int cpu; + + if (!last_pid) + pr_warning("function graph tracer: not enough memory\n"); + else + for_each_possible_cpu(cpu) { + pid_t *pid = per_cpu_ptr(last_pid, cpu); + *pid = -1; + } + + iter->private = last_pid; +} + +static void graph_trace_close(struct trace_iterator *iter) +{ + percpu_free(iter->private); +} + static struct tracer graph_trace __read_mostly = { .name = "function_graph", + .open = graph_trace_open, + .close = graph_trace_close, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, -- cgit v1.2.3-59-g8ed1b From cc2f6d90e950b69ad31d483c19cc1d121bb25c16 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 23 Jan 2009 13:03:37 -0800 Subject: kmemtrace: fix printk format warnings Fix kmemtrace printk warnings: kernel/trace/kmemtrace.c:142: warning: format '%4ld' expects type 'long int', but argument 3 has type 'size_t' kernel/trace/kmemtrace.c:147: warning: format '%4ld' expects type 'long int', but argument 3 has type 'size_t' Signed-off-by: Randy Dunlap Acked-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 7ebc58cee3bd..72b326bba005 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; /* Requested */ - ret = trace_seq_printf(s, "%4ld ", entry->bytes_req); + ret = trace_seq_printf(s, "%4zd ", entry->bytes_req); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Allocated */ - ret = trace_seq_printf(s, "%4ld ", entry->bytes_alloc); + ret = trace_seq_printf(s, "%4zd ", entry->bytes_alloc); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From 9011262a37cb438f0fa9394b5e83840db8f9680a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 23 Jan 2009 12:06:23 -0200 Subject: ftrace: add ftrace_vprintk Impact: new helper function Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 8 ++++++++ kernel/trace/trace.c | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9f7880d87c39..7840e718c6c7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -302,6 +302,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); extern int __ftrace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap) +extern int +__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(void); #else static inline void @@ -317,6 +320,11 @@ ftrace_printk(const char *fmt, ...) { return 0; } +static inline int +ftrace_vprintk(const char *fmt, va_list ap) +{ + return 0; +} static inline void ftrace_dump(void) { } #endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2129ab9d2a48..2f8ac1f008f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2951,6 +2951,15 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) } EXPORT_SYMBOL_GPL(__ftrace_printk); +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { -- cgit v1.2.3-59-g8ed1b From c71a896154119f4ca9e89d6078f5f63ad60ef199 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 23 Jan 2009 12:06:27 -0200 Subject: blktrace: add ftrace plugin Impact: New way of using the blktrace infrastructure This drops the requirement of userspace utilities to use the blktrace facility. Configuration is done thru sysfs, adding a "trace" directory to the partition directory where blktrace can be enabled for the associated request_queue. The same filters present in the IOCTL interface are present as sysfs device attributes. The /sys/block/sdX/sdXN/trace/enable file allows tracing without any filters. The other files in this directory: pid, act_mask, start_lba and end_lba can be used with the same meaning as with the IOCTL interface. Using the sysfs interface will only setup the request_queue->blk_trace fields, tracing will only take place when the "blk" tracer is selected via the ftrace interface, as in the following example: To see the trace, one can use the /d/tracing/trace file or the /d/tracign/trace_pipe file, with semantics defined in the ftrace documentation in Documentation/ftrace.txt. [root@f10-1 ~]# cat /t/trace kjournald-305 [000] 3046.491224: 8,1 A WBS 6367 + 8 <- (8,1) 6304 kjournald-305 [000] 3046.491227: 8,1 Q R 6367 + 8 [kjournald] kjournald-305 [000] 3046.491236: 8,1 G RB 6367 + 8 [kjournald] kjournald-305 [000] 3046.491239: 8,1 P NS [kjournald] kjournald-305 [000] 3046.491242: 8,1 I RBS 6367 + 8 [kjournald] kjournald-305 [000] 3046.491251: 8,1 D WB 6367 + 8 [kjournald] kjournald-305 [000] 3046.491610: 8,1 U WS [kjournald] 1 -0 [000] 3046.511914: 8,1 C RS 6367 + 8 [6367] [root@f10-1 ~]# The default line context (prefix) format is the one described in the ftrace documentation, with the blktrace specific bits using its existing format, described in blkparse(8). If one wants to have the classic blktrace formatting, this is possible by using: [root@f10-1 ~]# echo blk_classic > /t/trace_options [root@f10-1 ~]# cat /t/trace 8,1 0 3046.491224 305 A WBS 6367 + 8 <- (8,1) 6304 8,1 0 3046.491227 305 Q R 6367 + 8 [kjournald] 8,1 0 3046.491236 305 G RB 6367 + 8 [kjournald] 8,1 0 3046.491239 305 P NS [kjournald] 8,1 0 3046.491242 305 I RBS 6367 + 8 [kjournald] 8,1 0 3046.491251 305 D WB 6367 + 8 [kjournald] 8,1 0 3046.491610 305 U WS [kjournald] 1 8,1 0 3046.511914 0 C RS 6367 + 8 [6367] [root@f10-1 ~]# Using the ftrace standard format allows more flexibility, such as the ability of asking for backtraces via trace_options: [root@f10-1 ~]# echo noblk_classic > /t/trace_options [root@f10-1 ~]# echo stacktrace > /t/trace_options [root@f10-1 ~]# cat /t/trace kjournald-305 [000] 3318.826779: 8,1 A WBS 6375 + 8 <- (8,1) 6312 kjournald-305 [000] 3318.826782: <= submit_bio <= submit_bh <= sync_dirty_buffer <= journal_commit_transaction <= kjournald <= kthread <= child_rip kjournald-305 [000] 3318.826836: 8,1 Q R 6375 + 8 [kjournald] kjournald-305 [000] 3318.826837: <= generic_make_request <= submit_bio <= submit_bh <= sync_dirty_buffer <= journal_commit_transaction <= kjournald <= kthread Please read the ftrace documentation to use aditional, standardized tracing filters such as /d/tracing/trace_cpumask, etc. See also /d/tracing/trace_mark to add comments in the trace stream, that is equivalent to the /d/block/sdaN/msg interface. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/blktrace.c | 651 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/partitions/check.c | 7 + kernel/trace/trace.h | 1 + 3 files changed, 654 insertions(+), 5 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index b0a2cae886db..630f167f8240 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -25,9 +25,27 @@ #include #include #include +#include <../kernel/trace/trace_output.h> static unsigned int blktrace_seq __read_mostly = 1; +static struct trace_array *blk_tr; +static int __read_mostly blk_tracer_enabled; + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_BLK_OPT_CLASSIC 0x1 + +static struct tracer_opt blk_tracer_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC ) }, + { } +}; + +static struct tracer_flags blk_tracer_flags = { + .val = 0, + .opts = blk_tracer_opts, +}; + /* Global reference count of probes */ static DEFINE_MUTEX(blk_probe_mutex); static atomic_t blk_probes_ref = ATOMIC_INIT(0); @@ -43,6 +61,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, { struct blk_io_trace *t; + if (!bt->rchan) + return; + t = relay_reserve(bt->rchan, sizeof(*t) + len); if (t) { const int cpu = smp_processor_id(); @@ -90,6 +111,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) unsigned long flags; char *buf; + if (blk_tr) { + va_start(args, fmt); + ftrace_vprintk(fmt, args); + va_end(args); + return; + } + + if (!bt->msg_data) + return; + local_irq_save(flags); buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); @@ -131,13 +162,14 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int rw, u32 what, int error, int pdu_len, void *pdu_data) { struct task_struct *tsk = current; + struct ring_buffer_event *event = NULL; struct blk_io_trace *t; unsigned long flags; unsigned long *sequence; pid_t pid; - int cpu; + int cpu, pc = 0; - if (unlikely(bt->trace_state != Blktrace_running)) + if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled)) return; what |= ddir_act[rw & WRITE]; @@ -150,6 +182,24 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid = tsk->pid; if (unlikely(act_log_check(bt, what, sector, pid))) return; + cpu = raw_smp_processor_id(); + + if (blk_tr) { + struct trace_entry *ent; + tracing_record_cmdline(current); + + event = ring_buffer_lock_reserve(blk_tr->buffer, + sizeof(*t) + pdu_len, &flags); + if (!event) + return; + + ent = ring_buffer_event_data(event); + t = (struct blk_io_trace *)ent; + pc = preempt_count(); + tracing_generic_entry_update(ent, 0, pc); + ent->type = TRACE_BLK; + goto record_it; + } /* * A word about the locking here - we disable interrupts to reserve @@ -163,23 +213,33 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); if (t) { - cpu = smp_processor_id(); sequence = per_cpu_ptr(bt->sequence, cpu); t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->sequence = ++(*sequence); t->time = ktime_to_ns(ktime_get()); + t->cpu = cpu; + t->pid = pid; +record_it: t->sector = sector; t->bytes = bytes; t->action = what; - t->pid = pid; t->device = bt->dev; - t->cpu = cpu; t->error = error; t->pdu_len = pdu_len; if (pdu_len) memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + + if (blk_tr) { + ring_buffer_unlock_commit(blk_tr->buffer, event, flags); + if (pid != 0 && + (blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) == 0 && + (trace_flags & TRACE_ITER_STACKTRACE) != 0) + __trace_stack(blk_tr, NULL, flags, 5, pc); + trace_wake_up(); + return; + } } local_irq_restore(flags); @@ -888,3 +948,584 @@ static void blk_unregister_tracepoints(void) tracepoint_synchronize_unregister(); } + +/* + * struct blk_io_tracer formatting routines + */ + +static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +{ + int i = 0; + + if (t->action & BLK_TC_DISCARD) rwbs[i++] = 'D'; + else if (t->action & BLK_TC_WRITE) rwbs[i++] = 'W'; + else if (t->bytes) rwbs[i++] = 'R'; + else rwbs[i++] = 'N'; + + if (t->action & BLK_TC_AHEAD) rwbs[i++] = 'A'; + if (t->action & BLK_TC_BARRIER) rwbs[i++] = 'B'; + if (t->action & BLK_TC_SYNC) rwbs[i++] = 'S'; + if (t->action & BLK_TC_META) rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +static inline +const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +{ + return (const struct blk_io_trace *)ent; +} + +static inline const void *pdu_start(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent) + 1; +} + +static inline u32 t_sec(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes >> 9; +} + +static inline unsigned long long t_sector(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static inline __u16 t_error(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static __u64 get_pdu_int(const struct trace_entry *ent) +{ + const __u64 *val = pdu_start(ent); + return be64_to_cpu(*val); +} + +static void get_pdu_remap(const struct trace_entry *ent, + struct blk_io_trace_remap *r) +{ + const struct blk_io_trace_remap *__r = pdu_start(ent); + __u64 sector = __r->sector; + + r->device = be32_to_cpu(__r->device); + r->device_from = be32_to_cpu(__r->device_from); + r->sector = be64_to_cpu(sector); +} + +static int blk_log_action_iter(struct trace_iterator *iter, const char *act) +{ + char rwbs[6]; + unsigned long long ts = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(ts, USEC_PER_SEC); + unsigned secs = (unsigned long)ts; + const struct trace_entry *ent = iter->ent; + const struct blk_io_trace *t = (const struct blk_io_trace *)ent; + + fill_rwbs(rwbs, t); + + return trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, usec_rem, ent->pid, act, rwbs); +} + +static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, + const char *act) +{ + char rwbs[6]; + fill_rwbs(rwbs, t); + return trace_seq_printf(s, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *cmd = trace_find_cmdline(ent->pid); + + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); +} + +static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) +{ + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); +} + +static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +{ + struct blk_io_trace_remap r = { .device = 0, }; + + get_pdu_remap(ent, &r); + return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), + t_sec(ent), MAJOR(r.device), MINOR(r.device), + (unsigned long long)r.sector); +} + +static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); +} + +static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), + get_pdu_int(ent)); +} + +static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), trace_find_cmdline(ent->pid)); +} + +/* + * struct tracer operations + */ + +static void blk_tracer_print_header(struct seq_file *m) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return; + seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" + "# | | | | | |\n"); +} + +static void blk_tracer_start(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) + if (blk_register_tracepoints()) + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); +} + +static int blk_tracer_init(struct trace_array *tr) +{ + blk_tr = tr; + blk_tracer_start(tr); + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled++; + mutex_unlock(&blk_probe_mutex); + return 0; +} + +static void blk_tracer_stop(struct trace_array *tr) +{ + mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void blk_tracer_reset(struct trace_array *tr) +{ + if (!atomic_read(&blk_probes_ref)) + return; + + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled--; + WARN_ON(blk_tracer_enabled < 0); + mutex_unlock(&blk_probe_mutex); + + blk_tracer_stop(tr); +} + +static struct { + const char *act[2]; + int (*print)(struct trace_seq *s, const struct trace_entry *ent); +} what2act[] __read_mostly = { + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, + [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, + [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, + [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, + [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, + [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, + [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, + [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, + [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, + [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, + [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, + [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, + [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, +}; + +static int blk_trace_event_print(struct trace_seq *s, struct trace_entry *ent, + int flags) +{ + const struct blk_io_trace *t = (struct blk_io_trace *)ent; + const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); + int ret; + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(s, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(s, ent); + } + + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) +{ + const struct blk_io_trace *t; + u16 what; + int ret; + + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return TRACE_TYPE_UNHANDLED; + + t = (const struct blk_io_trace *)iter->ent; + what = t->action & ((1 << BLK_TC_SHIFT) - 1); + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_iter(iter, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(&iter->seq, iter->ent); + } + + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static struct tracer blk_tracer __read_mostly = { + .name = "blk", + .init = blk_tracer_init, + .reset = blk_tracer_reset, + .start = blk_tracer_start, + .stop = blk_tracer_stop, + .print_header = blk_tracer_print_header, + .print_line = blk_tracer_print_line, + .flags = &blk_tracer_flags, +}; + +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .trace = blk_trace_event_print, + .latency_trace = blk_trace_event_print, + .raw = trace_nop_print, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + +static int __init init_blk_tracer(void) +{ + if (!register_ftrace_event(&trace_blk_event)) { + pr_warning("Warning: could not register block events\n"); + return 1; + } + + if (register_tracer(&blk_tracer) != 0) { + pr_warning("Warning: could not register the block tracer\n"); + unregister_ftrace_event(&trace_blk_event); + return 1; + } + + return 0; +} + +device_initcall(init_blk_tracer); + +static int blk_trace_remove_queue(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (bt == NULL) + return -EINVAL; + + kfree(bt); + return 0; +} + +/* + * Setup everything required to start tracing + */ +static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +{ + struct blk_trace *old_bt, *bt = NULL; + int ret; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->dev = dev; + bt->act_mask = (u16)-1; + bt->end_lba = -1ULL; + bt->trace_state = Blktrace_running; + + old_bt = xchg(&q->blk_trace, bt); + if (old_bt != NULL) { + (void)xchg(&q->blk_trace, old_bt); + kfree(bt); + ret = -EBUSY; + } + return 0; +err: + return ret; +} + +/* + * sysfs interface to enable and configure tracing + */ + +static ssize_t sysfs_blk_trace_enable_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev != NULL) { + struct request_queue *q = bdev_get_queue(bdev); + + if (q != NULL) { + mutex_lock(&bdev->bd_mutex); + ret = sprintf(buf, "%u\n", !!q->blk_trace); + mutex_unlock(&bdev->bd_mutex); + } + + bdput(bdev); + } + + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + int value; + ssize_t ret = -ENXIO; + + if (count == 0 || sscanf(buf, "%d", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + if (value) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + else + ret = blk_trace_remove_queue(q); + mutex_unlock(&bdev->bd_mutex); + + if (ret == 0) + ret = count; +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf); +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +#define BLK_TRACE_DEVICE_ATTR(_name) \ + DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ + sysfs_blk_trace_attr_show, \ + sysfs_blk_trace_attr_store) + +static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, + sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); +static BLK_TRACE_DEVICE_ATTR(act_mask); +static BLK_TRACE_DEVICE_ATTR(pid); +static BLK_TRACE_DEVICE_ATTR(start_lba); +static BLK_TRACE_DEVICE_ATTR(end_lba); + +static struct attribute *blk_trace_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_act_mask.attr, + &dev_attr_pid.attr, + &dev_attr_start_lba.attr, + &dev_attr_end_lba.attr, + NULL +}; + +struct attribute_group blk_trace_attr_group = { + .name = "trace", + .attrs = blk_trace_attrs, +}; + +static int blk_str2act_mask(const char *str) +{ + int mask = 0; + char *copy = kstrdup(str, GFP_KERNEL), *s; + + if (copy == NULL) + return -ENOMEM; + + s = strstrip(copy); + + while (1) { + char *sep = strchr(s, ','); + + if (sep != NULL) + *sep = '\0'; + + if (strcasecmp(s, "barrier") == 0) + mask |= BLK_TC_BARRIER; + else if (strcasecmp(s, "complete") == 0) + mask |= BLK_TC_COMPLETE; + else if (strcasecmp(s, "fs") == 0) + mask |= BLK_TC_FS; + else if (strcasecmp(s, "issue") == 0) + mask |= BLK_TC_ISSUE; + else if (strcasecmp(s, "pc") == 0) + mask |= BLK_TC_PC; + else if (strcasecmp(s, "queue") == 0) + mask |= BLK_TC_QUEUE; + else if (strcasecmp(s, "read") == 0) + mask |= BLK_TC_READ; + else if (strcasecmp(s, "requeue") == 0) + mask |= BLK_TC_REQUEUE; + else if (strcasecmp(s, "sync") == 0) + mask |= BLK_TC_SYNC; + else if (strcasecmp(s, "write") == 0) + mask |= BLK_TC_WRITE; + + if (sep == NULL) + break; + + s = sep + 1; + } + kfree(copy); + + return mask; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q; + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + mutex_lock(&bdev->bd_mutex); + if (q->blk_trace == NULL) + ret = sprintf(buf, "disabled\n"); + else if (attr == &dev_attr_act_mask) + ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); + else if (attr == &dev_attr_pid) + ret = sprintf(buf, "%u\n", q->blk_trace->pid); + else if (attr == &dev_attr_start_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + else if (attr == &dev_attr_end_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + u64 value; + ssize_t ret = -ENXIO; + + if (count == 0) + goto out; + + if (attr == &dev_attr_act_mask) { + if (sscanf(buf, "%llx", &value) != 1) { + /* Assume it is a list of trace category names */ + value = blk_str2act_mask(buf); + if (value < 0) + goto out; + } + } else if (sscanf(buf, "%llu", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + ret = 0; + if (q->blk_trace == NULL) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + + if (ret == 0) { + if (attr == &dev_attr_act_mask) + q->blk_trace->act_mask = value; + else if (attr == &dev_attr_pid) + q->blk_trace->pid = value; + else if (attr == &dev_attr_start_lba) + q->blk_trace->start_lba = value; + else if (attr == &dev_attr_end_lba) + q->blk_trace->end_lba = value; + ret = count; + } + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6d720243f5f4..01714efdc65a 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -268,6 +268,10 @@ ssize_t part_fail_store(struct device *dev, } #endif +#ifdef CONFIG_BLK_DEV_IO_TRACE +extern struct attribute_group blk_trace_attr_group; +#endif + static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); @@ -294,6 +298,9 @@ static struct attribute_group part_attr_group = { static struct attribute_group *part_attr_groups[] = { &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b96037d970df..e603a291134b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -32,6 +32,7 @@ enum trace_type { TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, + TRACE_BLK, __TRACE_LAST_TYPE, }; -- cgit v1.2.3-59-g8ed1b From 0ee2ad84d36760c8e36a6a6729998dc7a4eacf38 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 26 Jan 2009 06:45:10 -0800 Subject: sysrq: include interrupt.h instead of irq.h Impact: fix compile error on s390. With "cpumask: update irq_desc to use cpumask_var_t" in linux-next we get this compile bug on s390: CC drivers/char/sysrq.o In file included from drivers/char/sysrq.c:38: include/linux/irq.h: In function 'init_alloc_desc_masks': include/linux/irq.h:442: error: dereferencing pointer to incomplete type Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- drivers/char/sysrq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index d41b9f6f7903..59214dba206d 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3-59-g8ed1b From 157f9c00e88529ed84bd7d581a40d411e5414cf0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 26 Jan 2009 15:00:56 -0200 Subject: tracing/blktrace: fix up checkpatch reported problems in ftrace plugin patch Also make sure sparse (make C=2 block/blktrace.o) is happy too. Reported-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/blktrace.c | 40 +++++++++++++++++++++++++--------------- fs/partitions/check.c | 5 +---- include/linux/blktrace_api.h | 5 +++++ 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 630f167f8240..1b2267c798b6 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -37,7 +37,7 @@ static int __read_mostly blk_tracer_enabled; static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ - { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC ) }, + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, { } }; @@ -169,7 +169,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; - if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled)) + if (unlikely(bt->trace_state != Blktrace_running || + !blk_tracer_enabled)) return; what |= ddir_act[rw & WRITE]; @@ -192,7 +193,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, sizeof(*t) + pdu_len, &flags); if (!event) return; - + ent = ring_buffer_event_data(event); t = (struct blk_io_trace *)ent; pc = preempt_count(); @@ -234,7 +235,7 @@ record_it: if (blk_tr) { ring_buffer_unlock_commit(blk_tr->buffer, event, flags); if (pid != 0 && - (blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) == 0 && + !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && (trace_flags & TRACE_ITER_STACKTRACE) != 0) __trace_stack(blk_tr, NULL, flags, 5, pc); trace_wake_up(); @@ -955,19 +956,27 @@ static void blk_unregister_tracepoints(void) static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) { - int i = 0; + int i = 0; - if (t->action & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (t->action & BLK_TC_WRITE) rwbs[i++] = 'W'; - else if (t->bytes) rwbs[i++] = 'R'; - else rwbs[i++] = 'N'; + if (t->action & BLK_TC_DISCARD) + rwbs[i++] = 'D'; + else if (t->action & BLK_TC_WRITE) + rwbs[i++] = 'W'; + else if (t->bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; - if (t->action & BLK_TC_AHEAD) rwbs[i++] = 'A'; - if (t->action & BLK_TC_BARRIER) rwbs[i++] = 'B'; - if (t->action & BLK_TC_SYNC) rwbs[i++] = 'S'; - if (t->action & BLK_TC_META) rwbs[i++] = 'M'; + if (t->action & BLK_TC_AHEAD) + rwbs[i++] = 'A'; + if (t->action & BLK_TC_BARRIER) + rwbs[i++] = 'B'; + if (t->action & BLK_TC_SYNC) + rwbs[i++] = 'S'; + if (t->action & BLK_TC_META) + rwbs[i++] = 'M'; - rwbs[i] = '\0'; + rwbs[i] = '\0'; } static inline @@ -1049,7 +1058,8 @@ static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) return trace_seq_printf(s, "[%s]\n", cmd); } -static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) +static int blk_log_with_error(struct trace_seq *s, + const struct trace_entry *ent) { if (t_sec(ent)) return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 01714efdc65a..8a17f7edcc74 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "check.h" @@ -268,10 +269,6 @@ ssize_t part_fail_store(struct device *dev, } #endif -#ifdef CONFIG_BLK_DEV_IO_TRACE -extern struct attribute_group blk_trace_attr_group; -#endif - static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 1dba3493d520..59b4b2e8ab67 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -142,6 +142,9 @@ struct blk_user_trace_setup { #ifdef __KERNEL__ #if defined(CONFIG_BLK_DEV_IO_TRACE) + +#include + struct blk_trace { int trace_state; struct rchan *rchan; @@ -192,6 +195,8 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); +extern struct attribute_group blk_trace_attr_group; + #else /* !CONFIG_BLK_DEV_IO_TRACE */ #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) #define blk_trace_shutdown(q) do { } while (0) -- cgit v1.2.3-59-g8ed1b From bd0838fc48c174cd386447059a414318e78169e1 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 26 Jan 2009 18:08:47 -0800 Subject: x86: intel_cacheinfo: fix compiler warning fix the following warning: CC arch/x86/kernel/cpu/intel_cacheinfo.o arch/x86/kernel/cpu/intel_cacheinfo.c:314: warning: 'cpuid4_cache_lookup' defined but not used Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 58527a9fc404..51b5dfd67163 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -309,15 +309,6 @@ __cpuinit cpuid4_cache_lookup_regs(int index, return 0; } -static int -__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) -{ - struct _cpuid4_info_regs *leaf_regs = - (struct _cpuid4_info_regs *)this_leaf; - - return cpuid4_cache_lookup_regs(index, leaf_regs); -} - static int __cpuinit find_num_cache_leaves(void) { unsigned int eax, ebx, ecx, edx; @@ -556,6 +547,15 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) per_cpu(cpuid4_info, cpu) = NULL; } +static int +__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) +{ + struct _cpuid4_info_regs *leaf_regs = + (struct _cpuid4_info_regs *)this_leaf; + + return cpuid4_cache_lookup_regs(index, leaf_regs); +} + static void __cpuinit get_cpu_leaves(void *_retval) { int j, *retval = _retval, cpu = smp_processor_id(); -- cgit v1.2.3-59-g8ed1b From 32c0bd9624115041cfec31c0436995418083090a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 26 Jan 2009 16:00:40 -0200 Subject: blktrace: the ftrace interface needs CONFIG_TRACING Impact: build fix Also mention in the help text that blktrace now can be used using the ftrace interface. Reported-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/Kconfig b/block/Kconfig index 0cbb3b88b59a..7cdaa1d72252 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -50,6 +50,8 @@ config BLK_DEV_IO_TRACE select RELAY select DEBUG_FS select TRACEPOINTS + select TRACING + select STACKTRACE help Say Y here if you want to be able to trace the block layer actions on a given queue. Tracing allows you to see any traffic happening @@ -58,6 +60,12 @@ config BLK_DEV_IO_TRACE git://git.kernel.dk/blktrace.git + Tracing also is possible using the ftrace interface, e.g.: + + echo 1 > /sys/block/sda/sda1/trace/enable + echo blk > /sys/kernel/debug/tracing/current_tracer + cat /sys/kernel/debug/tracing/trace_pipe + If unsure, say N. config BLK_DEV_BSG -- cgit v1.2.3-59-g8ed1b From 890252823766e562301e61340f3187a14033d045 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 26 Jan 2009 18:28:02 +0300 Subject: x86: ftrace - simplify wait_for_nmi Get rid of 'waited' stack variable. Signed-off-by: Cyrill Gorcunov Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1b43086b097a..4d33224c055f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -133,15 +133,14 @@ void ftrace_nmi_exit(void) static void wait_for_nmi(void) { - int waited = 0; + if (!atomic_read(&in_nmi)) + return; - while (atomic_read(&in_nmi)) { - waited = 1; + do { cpu_relax(); - } + } while(atomic_read(&in_nmi)); - if (waited) - nmi_wait_count++; + nmi_wait_count++; } static int -- cgit v1.2.3-59-g8ed1b From 5e1065726e0350097d8fe18dc2fcf86516a0a1f2 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Thu, 22 Jan 2009 21:34:05 -0600 Subject: [XFS] Warn on transaction in flight on read-only remount Till VFS can correctly support read-only remount without racing, use WARN_ON instead of BUG_ON on detecting transaction in flight after quiescing filesystem. Signed-off-by: Felix Blyakher Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_sync.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 2ed035354c26..a608e72fa405 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -371,7 +371,11 @@ xfs_quiesce_attr( /* flush inodes and push all remaining buffers out to disk */ xfs_quiesce_fs(mp); - ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. + */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); /* Push the superblock and write an unmount record */ error = xfs_log_sbcount(mp, 1); -- cgit v1.2.3-59-g8ed1b From ac12b4e25eedf855f277741d5ac0c3c88981a703 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 25 Jan 2009 20:53:00 -0600 Subject: don't reallocate sxp variable passed into xfs_swapext fixes kernel.org bugzilla 12538, xfs_fsr fails on 2.6.29-rc kernels Regression caused by 743bb4650da9e2595d6cedd01c680b5b9398c74a This was an embarrasing mistake, reallocating the sxp pointer passed in from the main ioctl switch. Signed-off-by: Eric Sandeen Tested-by: Paul Martin Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_dfrag.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index b4c1ee713492..f8278cfcc1d3 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -55,17 +55,11 @@ xfs_swapext( struct file *file, *target_file; int error = 0; - sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL); - if (!sxp) { - error = XFS_ERROR(ENOMEM); - goto out; - } - /* Pull information for the target fd */ file = fget((int)sxp->sx_fdtarget); if (!file) { error = XFS_ERROR(EINVAL); - goto out_free_sxp; + goto out; } if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) { @@ -109,8 +103,6 @@ xfs_swapext( fput(target_file); out_put_file: fput(file); - out_free_sxp: - kmem_free(sxp); out: return error; } -- cgit v1.2.3-59-g8ed1b From 700a3dcb9036d17d3a67d0a7ceee9d4373fbb570 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 28 Jan 2009 12:33:56 -0200 Subject: blktrace: Use tracing_reset_online_cpus Impact: cleanup Use tracing_reset_online_cpus instead of open coding it. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/blktrace.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 1b2267c798b6..04d81d31fd94 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1109,12 +1109,7 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { - int cpu; - - tr->time_start = ftrace_now(tr->cpu); - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); + tracing_reset_online_cpus(tr); mutex_lock(&blk_probe_mutex); if (atomic_add_return(1, &blk_probes_ref) == 1) -- cgit v1.2.3-59-g8ed1b From f04109bf1be7449e27d38ae1bb8465013374bd49 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 28 Jan 2009 13:02:12 -0200 Subject: trace: Use tracing_reset_online_cpus in more places MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 6 +----- kernel/trace/trace_functions_graph.c | 8 ++------ kernel/trace/trace_nop.c | 6 +----- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index ca017e0a9a27..1284145c8898 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -133,11 +133,7 @@ static void stop_branch_trace(struct trace_array *tr) static int branch_trace_init(struct trace_array *tr) { - int cpu; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - + tracing_reset_online_cpus(tr); start_branch_trace(tr); return 0; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 66fc7b806a38..c97594d826bc 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -52,15 +52,11 @@ static struct tracer_flags tracer_flags = { static int graph_trace_init(struct trace_array *tr) { - int cpu, ret; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - - ret = register_ftrace_graph(&trace_graph_return, + int ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); if (ret) return ret; + tracing_reset_online_cpus(tr); tracing_start_cmdline_record(); return 0; diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index b9767acd30ac..087b6cbf4ea5 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -47,12 +47,8 @@ static void stop_nop_trace(struct trace_array *tr) static int nop_trace_init(struct trace_array *tr) { - int cpu; ctx_trace = tr; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - + tracing_reset_online_cpus(tr); start_nop_trace(tr); return 0; } -- cgit v1.2.3-59-g8ed1b From b3a8c34886d0e3dd3a24a5b614ee025181da2f41 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 28 Jan 2009 13:08:37 -0200 Subject: trace_sched_wakeup: Remove unused variable Impact: cleanup Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/trace/trace_sched_wakeup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 93cecda650b2..a48c9b4b0c85 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -184,13 +184,10 @@ out: static void __wakeup_reset(struct trace_array *tr) { - struct trace_array_cpu *data; int cpu; - for_each_possible_cpu(cpu) { - data = tr->data[cpu]; + for_each_possible_cpu(cpu) tracing_reset(tr, cpu); - } wakeup_cpu = -1; wakeup_prio = -1; -- cgit v1.2.3-59-g8ed1b From ecf441b593ac41cb8cd8cd3695110167c42e098c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 29 Jan 2009 13:49:45 -0800 Subject: kmemtrace: fix printk formats, fix Geert Uytterhoeven wrote: > %4zu? Reported-by: Geert Uytterhoeven Signed-off-by: Randy Dunlap Acked-by: Eduard - Gabriel Munteanu Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 72b326bba005..f04c0625f1cd 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; /* Requested */ - ret = trace_seq_printf(s, "%4zd ", entry->bytes_req); + ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); if (!ret) return TRACE_TYPE_PARTIAL_LINE; /* Allocated */ - ret = trace_seq_printf(s, "%4zd ", entry->bytes_alloc); + ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From 5e54f5986a579b8445aa1d5ad3435c2cf7568bed Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 30 Jan 2009 15:29:54 -0800 Subject: softlockup: remove unused definition for spawn_softlockup_task The definition of spawn_softlockup_task in sched.h became unnecessary once it was converted to the early_initcall() interface. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f2f94d532302..2a2811c6239d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -302,9 +302,6 @@ extern int softlockup_thresh; static inline void softlockup_tick(void) { } -static inline void spawn_softlockup_task(void) -{ -} static inline void touch_softlockup_watchdog(void) { } -- cgit v1.2.3-59-g8ed1b From b2821ae68b14480bfc85ea1629537163310bc5cd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Feb 2009 21:38:32 -0500 Subject: trace: fix default boot up tracer Peter Zijlstra started the functionality to start up a default tracing at bootup. This patch finishes the work. Now if you add 'ftrace=' to the command line, when that tracer is registered on bootup, that tracer is selected and starts tracing. Note, all selftests for tracers that are registered after this tracer is disabled. This prevents the selftests from disturbing the running tracer, or the running tracer from disturbing the selftest. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2f8ac1f008f5..2c720c79bc60 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -53,6 +53,11 @@ unsigned long __read_mostly tracing_thresh; */ static bool __read_mostly tracing_selftest_running; +/* + * If a tracer is running, we do not want to run SELFTEST. + */ +static bool __read_mostly tracing_selftest_disabled; + /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } @@ -110,14 +115,19 @@ static cpumask_var_t __read_mostly tracing_buffer_mask; */ int ftrace_dump_on_oops; -static int tracing_set_tracer(char *buf); +static int tracing_set_tracer(const char *buf); + +#define BOOTUP_TRACER_SIZE 100 +static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; +static char *default_bootup_tracer; static int __init set_ftrace(char *str) { - tracing_set_tracer(str); + strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); + default_bootup_tracer = bootup_tracer_buf; return 1; } -__setup("ftrace", set_ftrace); +__setup("ftrace=", set_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { @@ -468,7 +478,7 @@ int register_tracer(struct tracer *type) type->flags->opts = dummy_tracer_opt; #ifdef CONFIG_FTRACE_STARTUP_TEST - if (type->selftest) { + if (type->selftest && !tracing_selftest_disabled) { struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; int i; @@ -510,8 +520,25 @@ int register_tracer(struct tracer *type) out: tracing_selftest_running = false; mutex_unlock(&trace_types_lock); - lock_kernel(); + if (!ret && default_bootup_tracer) { + if (!strncmp(default_bootup_tracer, type->name, + BOOTUP_TRACER_SIZE)) { + printk(KERN_INFO "Starting tracer '%s'\n", + type->name); + /* Do we want this tracer to start on bootup? */ + tracing_set_tracer(type->name); + default_bootup_tracer = NULL; + /* disable other selftests, since this will break it. */ + tracing_selftest_disabled = 1; +#ifdef CONFIG_FTRACE_STARTUP_TEST + printk(KERN_INFO "Disabling FTRACE selftests due" + " to running tracer '%s'\n", type->name); +#endif + } + } + + lock_kernel(); return ret; } @@ -2245,7 +2272,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static int tracing_set_tracer(char *buf) +static int tracing_set_tracer(const char *buf) { struct trace_array *tr = &global_trace; struct tracer *t; @@ -3163,5 +3190,26 @@ out_free_buffer_mask: out: return ret; } + +__init static int clear_boot_tracer(void) +{ + /* + * The default tracer at boot buffer is an init section. + * This function is called in lateinit. If we did not + * find the boot tracer, then clear it out, to prevent + * later registration from accessing the buffer that is + * about to be freed. + */ + if (!default_bootup_tracer) + return 0; + + printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", + default_bootup_tracer); + default_bootup_tracer = NULL; + + return 0; +} + early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); +late_initcall(clear_boot_tracer); -- cgit v1.2.3-59-g8ed1b From 79fb0768fbd371f3b94d909f51f587b3a24ab272 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Feb 2009 21:38:33 -0500 Subject: trace: let boot trace be chosen by command line Now that we have a working ftrace= function, make the boot tracer get activated by it. This way we can turn it on or off without recompiling the kernel, as well as keeping the selftests on. The selftests are disabled whenever a default tracer starts running. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 7 +++---- kernel/trace/trace.c | 5 +---- kernel/trace/trace_boot.c | 11 +++++++---- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index dde1d46f77e5..28f2644484d9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,9 +164,8 @@ config BOOT_TRACER representation of the delays during initcalls - but the raw /debug/tracing/trace text output is readable too. - ( Note that tracing self tests can't be enabled if this tracer is - selected, because the self-tests are an initcall as well and that - would invalidate the boot trace. ) + You must pass in ftrace=initcall to the kernel command line + to enable this on bootup. config TRACE_BRANCH_PROFILING bool "Trace likely/unlikely profiler" @@ -326,7 +325,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER + depends on TRACING && DEBUG_KERNEL select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2c720c79bc60..40edef4255c5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3167,12 +3167,9 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); register_tracer(&nop_trace); + current_trace = &nop_trace; #ifdef CONFIG_BOOT_TRACER register_tracer(&boot_tracer); - current_trace = &boot_tracer; - current_trace->init(&global_trace); -#else - current_trace = &nop_trace; #endif /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 0e94b3d091f7..1f07895977a0 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -28,13 +28,13 @@ void start_boot_trace(void) void enable_boot_trace(void) { - if (pre_initcalls_finished) + if (boot_trace && pre_initcalls_finished) tracing_start_sched_switch_record(); } void disable_boot_trace(void) { - if (pre_initcalls_finished) + if (boot_trace && pre_initcalls_finished) tracing_stop_sched_switch_record(); } @@ -43,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr) int cpu; boot_trace = tr; + if (!tr) + return 0; + for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); @@ -132,7 +135,7 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!pre_initcalls_finished) + if (!tr || !pre_initcalls_finished) return; /* Get its name now since this function could @@ -164,7 +167,7 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!pre_initcalls_finished) + if (!tr || !pre_initcalls_finished) return; sprint_symbol(bt->func, (unsigned long)fn); -- cgit v1.2.3-59-g8ed1b From c4a8e8be2d43cc22b371e8e9c05c253409759d94 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 2 Feb 2009 20:29:21 -0200 Subject: trace: better manage the context info for events Impact: make trace_event more convenient for tracers All tracers (for the moment) that use the struct trace_event want to have the context info printed before their own output: the pid/cmdline, cpu, and timestamp. But some other tracers that want to implement their trace_event callbacks will not necessary need these information or they may want to format them as they want. This patch adds a new default-enabled trace option: TRACE_ITER_CONTEXT_INFO When disabled through: echo nocontext-info > /debugfs/tracing/trace_options The pid, cpu and timestamps headers will not be printed. IE with the sched_switch tracer with context-info (default): bash-2935 [001] 100.356561: 2935:120:S ==> [001] 0:140:R -0 [000] 100.412804: 0:140:R + [000] 11:115:S events/0 -0 [000] 100.412816: 0:140:R ==> [000] 11:115:R events/0 events/0-11 [000] 100.412829: 11:115:S ==> [000] 0:140:R Without context-info: 2935:120:S ==> [001] 0:140:R 0:140:R + [000] 11:115:S events/0 0:140:R ==> [000] 11:115:R events/0 11:115:S ==> [000] 0:140:R A tracer can disable it at runtime by clearing the bit TRACE_ITER_CONTEXT_INFO in trace_flags. The print routines were renamed to trace_print_context and trace_print_lat_context, so that they can be used by tracers if they want to use them for one of the trace_event callbacks. Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 149 +++++++++++--------------------------------- kernel/trace/trace.h | 7 ++- kernel/trace/trace_output.c | 107 +++++++++++++++++++++++++++++++ kernel/trace/trace_output.h | 3 + 4 files changed, 151 insertions(+), 115 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2f8ac1f008f5..5ec49c3c1597 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -227,7 +227,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO; /** * trace_wake_up - wake up tasks waiting for trace input @@ -285,6 +285,7 @@ static const char *trace_options[] = { "userstacktrace", "sym-userobj", "printk-msg-only", + "context-info", NULL }; @@ -1171,8 +1172,8 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) } /* Find the next real entry, without updating the iterator itself */ -static struct trace_entry * -find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts) { return __find_next_entry(iter, ent_cpu, ent_ts); } @@ -1351,57 +1352,6 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) seq_puts(m, "\n"); } -static void -lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) -{ - int hardirq, softirq; - char *comm; - - comm = trace_find_cmdline(entry->pid); - - trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); - trace_seq_printf(s, "%3d", cpu); - trace_seq_printf(s, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', - ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); - - hardirq = entry->flags & TRACE_FLAG_HARDIRQ; - softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (hardirq && softirq) { - trace_seq_putc(s, 'H'); - } else { - if (hardirq) { - trace_seq_putc(s, 'h'); - } else { - if (softirq) - trace_seq_putc(s, 's'); - else - trace_seq_putc(s, '.'); - } - } - - if (entry->preempt_count) - trace_seq_printf(s, "%x", entry->preempt_count); - else - trace_seq_puts(s, "."); -} - -unsigned long preempt_mark_thresh = 100; - -static void -lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, - unsigned long rel_usecs) -{ - trace_seq_printf(s, " %4lldus", abs_usecs); - if (rel_usecs > preempt_mark_thresh) - trace_seq_puts(s, "!: "); - else if (rel_usecs > 1) - trace_seq_puts(s, "+: "); - else - trace_seq_puts(s, " : "); -} - static void test_cpu_buff_start(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1419,46 +1369,24 @@ static void test_cpu_buff_start(struct trace_iterator *iter) trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } -static enum print_line_t -print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) +static enum print_line_t print_lat_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *next_entry; struct trace_event *event; - unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); struct trace_entry *entry = iter->ent; - unsigned long abs_usecs; - unsigned long rel_usecs; - u64 next_ts; - char *comm; int ret; test_cpu_buff_start(iter); - next_entry = find_next_entry(iter, NULL, &next_ts); - if (!next_entry) - next_ts = iter->ts; - rel_usecs = ns2usecs(next_ts - iter->ts); - abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); - - if (verbose) { - comm = trace_find_cmdline(entry->pid); - trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]" - " %ld.%03ldms (+%ld.%03ldms): ", - comm, - entry->pid, cpu, entry->flags, - entry->preempt_count, trace_idx, - ns2usecs(iter->ts), - abs_usecs/1000, - abs_usecs % 1000, rel_usecs/1000, - rel_usecs % 1000); - } else { - lat_print_generic(s, entry, cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); + event = ftrace_find_event(entry->type); + + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + ret = trace_print_lat_context(iter); + if (ret) + return ret; } - event = ftrace_find_event(entry->type); if (event && event->latency_trace) { ret = event->latency_trace(s, entry, sym_flags); if (ret) @@ -1476,33 +1404,20 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; - unsigned long usec_rem; - unsigned long long t; - unsigned long secs; - char *comm; int ret; entry = iter->ent; test_cpu_buff_start(iter); - comm = trace_find_cmdline(iter->ent->pid); - - t = ns2usecs(iter->ts); - usec_rem = do_div(t, 1000000ULL); - secs = (unsigned long)t; + event = ftrace_find_event(entry->type); - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "[%03d] ", iter->cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + ret = trace_print_context(iter); + if (ret) + return ret; + } - event = ftrace_find_event(entry->type); if (event && event->trace) { ret = event->trace(s, entry, sym_flags); if (ret) @@ -1525,10 +1440,12 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - ret = trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, iter->ts); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + ret = trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } event = ftrace_find_event(entry->type); if (event && event->raw) { @@ -1553,9 +1470,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); - SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, iter->ts); + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); + SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); + SEQ_PUT_HEX_FIELD_RET(s, iter->ts); + } event = ftrace_find_event(entry->type); if (event && event->hex) @@ -1575,7 +1494,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, field->buf); + ret = trace_seq_printf(s, "%s", field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1590,9 +1509,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; - SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); - SEQ_PUT_FIELD_RET(s, iter->ts); + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_FIELD_RET(s, entry->pid); + SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, iter->ts); + } event = ftrace_find_event(entry->type); if (event && event->binary) @@ -1643,7 +1564,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return print_raw_fmt(iter); if (iter->iter_flags & TRACE_FILE_LAT_FMT) - return print_lat_fmt(iter, iter->idx, iter->cpu); + return print_lat_fmt(iter); return print_trace_fmt(iter); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e603a291134b..f0c7a0f08cac 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -405,6 +405,10 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); + +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts); + void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc); @@ -591,7 +595,8 @@ enum trace_iterator_flags { TRACE_ITER_ANNOTATE = 0x2000, TRACE_ITER_USERSTACKTRACE = 0x4000, TRACE_ITER_SYM_USEROBJ = 0x8000, - TRACE_ITER_PRINTK_MSGONLY = 0x10000 + TRACE_ITER_PRINTK_MSGONLY = 0x10000, + TRACE_ITER_CONTEXT_INFO = 0x20000 /* Print pid/cpu/time */ }; /* diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1a4e144a9f8f..a5752d4d3c33 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -286,6 +286,113 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } +static void +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + int hardirq, softirq; + char *comm; + + comm = trace_find_cmdline(entry->pid); + + trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); + trace_seq_printf(s, "%3d", cpu); + trace_seq_printf(s, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', + ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) { + trace_seq_putc(s, 'H'); + } else { + if (hardirq) { + trace_seq_putc(s, 'h'); + } else { + if (softirq) + trace_seq_putc(s, 's'); + else + trace_seq_putc(s, '.'); + } + } + + if (entry->preempt_count) + trace_seq_printf(s, "%x", entry->preempt_count); + else + trace_seq_puts(s, "."); +} + +static unsigned long preempt_mark_thresh = 100; + +static void +lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, + unsigned long rel_usecs) +{ + trace_seq_printf(s, " %4lldus", abs_usecs); + if (rel_usecs > preempt_mark_thresh) + trace_seq_puts(s, "!: "); + else if (rel_usecs > 1) + trace_seq_puts(s, "+: "); + else + trace_seq_puts(s, " : "); +} + +int trace_print_context(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + char *comm = trace_find_cmdline(entry->pid); + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned long secs = (unsigned long)t; + + if (!trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid)) + goto partial; + if (!trace_seq_printf(s, "[%03d] ", entry->cpu)) + goto partial; + if (!trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem)) + goto partial; + + return 0; + +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +int trace_print_lat_context(struct trace_iterator *iter) +{ + u64 next_ts; + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent, + *next_entry = trace_find_next_entry(iter, NULL, + &next_ts); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); + unsigned long rel_usecs; + + if (!next_entry) + next_ts = iter->ts; + rel_usecs = ns2usecs(next_ts - iter->ts); + + if (verbose) { + char *comm = trace_find_cmdline(entry->pid); + trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", + comm, + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, iter->idx, + ns2usecs(iter->ts), + abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, + rel_usecs % 1000); + } else { + lat_print_generic(s, entry, entry->cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); + } + + return 0; +} + static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; static int task_state_char(unsigned long state) diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 1cbab5e3dc99..ec2ed90f10f0 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -33,6 +33,9 @@ int seq_print_userip_objs(const struct userstack_entry *entry, int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, unsigned long ip, unsigned long sym_flags); +int trace_print_context(struct trace_iterator *iter); +int trace_print_lat_context(struct trace_iterator *iter); + struct trace_event *ftrace_find_event(int type); int register_ftrace_event(struct trace_event *event); int unregister_ftrace_event(struct trace_event *event); -- cgit v1.2.3-59-g8ed1b From 2c9b238eb325895d3312dad64e2685783575e474 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 2 Feb 2009 20:30:12 -0200 Subject: trace: Change struct trace_event callbacks parameter list Impact: API change The trace_seq and trace_entry are in trace_iterator, where there are more fields that may be needed by tracers, so just pass the tracer_iterator as is already the case for struct tracer->print_line. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/blktrace.c | 8 +-- kernel/trace/trace.c | 10 ++-- kernel/trace/trace_branch.c | 7 +-- kernel/trace/trace_output.c | 139 ++++++++++++++++++++------------------------ kernel/trace/trace_output.h | 6 +- 5 files changed, 76 insertions(+), 94 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 3f25425ade12..570cd3c40bd1 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1140,10 +1140,10 @@ static struct { [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; -static int blk_trace_event_print(struct trace_seq *s, struct trace_entry *ent, - int flags) +static int blk_trace_event_print(struct trace_iterator *iter, int flags) { - const struct blk_io_trace *t = (struct blk_io_trace *)ent; + struct trace_seq *s = &iter->seq; + const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); int ret; @@ -1153,7 +1153,7 @@ static int blk_trace_event_print(struct trace_seq *s, struct trace_entry *ent, const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); if (ret) - ret = what2act[what].print(s, ent); + ret = what2act[what].print(s, iter->ent); } return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5ec49c3c1597..152d0969adf8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1388,7 +1388,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter) } if (event && event->latency_trace) { - ret = event->latency_trace(s, entry, sym_flags); + ret = event->latency_trace(iter, sym_flags); if (ret) return ret; return TRACE_TYPE_HANDLED; @@ -1419,7 +1419,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) } if (event && event->trace) { - ret = event->trace(s, entry, sym_flags); + ret = event->trace(iter, sym_flags); if (ret) return ret; return TRACE_TYPE_HANDLED; @@ -1449,7 +1449,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event && event->raw) { - ret = event->raw(s, entry, 0); + ret = event->raw(iter, 0); if (ret) return ret; return TRACE_TYPE_HANDLED; @@ -1478,7 +1478,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event && event->hex) - event->hex(s, entry, 0); + event->hex(iter, 0); SEQ_PUT_FIELD_RET(s, newline); @@ -1517,7 +1517,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event && event->binary) - event->binary(s, entry, 0); + event->binary(iter, 0); return TRACE_TYPE_HANDLED; } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 1284145c8898..ea62f101e615 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -160,14 +160,13 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int -trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_branch_print(struct trace_iterator *iter, int flags) { struct trace_branch *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); - if (trace_seq_printf(s, "[%s] %s:%s:%d\n", + if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", field->correct ? " ok " : " MISS ", field->func, field->file, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a5752d4d3c33..c24503b281a0 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -484,19 +484,18 @@ int unregister_ftrace_event(struct trace_event *event) * Standard events */ -int -trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags) +int trace_nop_print(struct trace_iterator *iter, int flags) { return 0; } /* TRACE_FN */ -static int -trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_fn_latency(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!seq_print_ip_sym(s, field->ip, flags)) goto partial; @@ -513,12 +512,12 @@ trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int -trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_fn_trace(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!seq_print_ip_sym(s, field->ip, flags)) goto partial; @@ -540,14 +539,13 @@ trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int -trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_fn_raw(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, "%lx %lx\n", + if (!trace_seq_printf(&iter->seq, "%lx %lx\n", field->ip, field->parent_ip)) return TRACE_TYPE_PARTIAL_LINE; @@ -555,12 +553,12 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags) return 0; } -static int -trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_fn_hex(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); SEQ_PUT_HEX_FIELD_RET(s, field->ip); SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); @@ -568,12 +566,12 @@ trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags) return 0; } -static int -trace_fn_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_fn_bin(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); SEQ_PUT_FIELD_RET(s, field->ip); SEQ_PUT_FIELD_RET(s, field->parent_ip); @@ -591,20 +589,19 @@ static struct trace_event trace_fn_event = { }; /* TRACE_CTX an TRACE_WAKE */ -static int -trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags, - char *delim) +static int trace_ctxwake_print(struct trace_iterator *iter, char *delim) { struct ctx_switch_entry *field; char *comm; int S, T; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); T = task_state_char(field->next_state); S = task_state_char(field->prev_state); comm = trace_find_cmdline(field->next_pid); - if (!trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + if (!trace_seq_printf(&iter->seq, + " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, field->prev_prio, S, delim, @@ -617,31 +614,27 @@ trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags, return 0; } -static int -trace_ctx_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_ctx_print(struct trace_iterator *iter, int flags) { - return trace_ctxwake_print(s, entry, flags, "==>"); + return trace_ctxwake_print(iter, "==>"); } -static int -trace_wake_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_wake_print(struct trace_iterator *iter, int flags) { - return trace_ctxwake_print(s, entry, flags, " +"); + return trace_ctxwake_print(iter, " +"); } -static int -trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags, - char S) +static int trace_ctxwake_raw(struct trace_iterator *iter, char S) { struct ctx_switch_entry *field; int T; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!S) task_state_char(field->prev_state); T = task_state_char(field->next_state); - if (!trace_seq_printf(s, "%d %d %c %d %d %d %c\n", + if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, field->prev_prio, S, @@ -654,27 +647,24 @@ trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags, return 0; } -static int -trace_ctx_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_ctx_raw(struct trace_iterator *iter, int flags) { - return trace_ctxwake_raw(s, entry, flags, 0); + return trace_ctxwake_raw(iter, 0); } -static int -trace_wake_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_wake_raw(struct trace_iterator *iter, int flags) { - return trace_ctxwake_raw(s, entry, flags, '+'); + return trace_ctxwake_raw(iter, '+'); } -static int -trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags, - char S) +static int trace_ctxwake_hex(struct trace_iterator *iter, char S) { struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; int T; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!S) task_state_char(field->prev_state); @@ -691,24 +681,22 @@ trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags, return 0; } -static int -trace_ctx_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_ctx_hex(struct trace_iterator *iter, int flags) { - return trace_ctxwake_hex(s, entry, flags, 0); + return trace_ctxwake_hex(iter, 0); } -static int -trace_wake_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_wake_hex(struct trace_iterator *iter, int flags) { - return trace_ctxwake_hex(s, entry, flags, '+'); + return trace_ctxwake_hex(iter, '+'); } -static int -trace_ctxwake_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_ctxwake_bin(struct trace_iterator *iter, int flags) { struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); SEQ_PUT_FIELD_RET(s, field->prev_pid); SEQ_PUT_FIELD_RET(s, field->prev_prio); @@ -739,14 +727,13 @@ static struct trace_event trace_wake_event = { }; /* TRACE_SPECIAL */ -static int -trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_special_print(struct trace_iterator *iter, int flags) { struct special_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, "# %ld %ld %ld\n", + if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", field->arg1, field->arg2, field->arg3)) @@ -755,12 +742,12 @@ trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags) return 0; } -static int -trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_special_hex(struct trace_iterator *iter, int flags) { struct special_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); SEQ_PUT_HEX_FIELD_RET(s, field->arg1); SEQ_PUT_HEX_FIELD_RET(s, field->arg2); @@ -769,12 +756,12 @@ trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags) return 0; } -static int -trace_special_bin(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_special_bin(struct trace_iterator *iter, int flags) { struct special_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); SEQ_PUT_FIELD_RET(s, field->arg1); SEQ_PUT_FIELD_RET(s, field->arg2); @@ -794,13 +781,13 @@ static struct trace_event trace_special_event = { /* TRACE_STACK */ -static int -trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_stack_print(struct trace_iterator *iter, int flags) { struct stack_entry *field; + struct trace_seq *s = &iter->seq; int i; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) { @@ -830,13 +817,12 @@ static struct trace_event trace_stack_event = { }; /* TRACE_USER_STACK */ -static int -trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry, - int flags) +static int trace_user_stack_print(struct trace_iterator *iter, int flags) { struct userstack_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!seq_print_userip_objs(field, s, flags)) goto partial; @@ -860,12 +846,12 @@ static struct trace_event trace_user_stack_event = { }; /* TRACE_PRINT */ -static int -trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_print_print(struct trace_iterator *iter, int flags) { struct print_entry *field; + struct trace_seq *s = &iter->seq; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!seq_print_ip_sym(s, field->ip, flags)) goto partial; @@ -879,14 +865,13 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int -trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags) +static int trace_print_raw(struct trace_iterator *iter, int flags) { struct print_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, "# %lx %s", field->ip, field->buf)) + if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) goto partial; return 0; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index ec2ed90f10f0..3aeb31f6506b 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -3,8 +3,7 @@ #include "trace.h" -typedef int (*trace_print_func)(struct trace_seq *s, struct trace_entry *entry, - int flags); +typedef int (*trace_print_func)(struct trace_iterator *iter, int flags); struct trace_event { struct hlist_node node; @@ -40,8 +39,7 @@ struct trace_event *ftrace_find_event(int type); int register_ftrace_event(struct trace_event *event); int unregister_ftrace_event(struct trace_event *event); -int -trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags); +int trace_nop_print(struct trace_iterator *iter, int flags); #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) -- cgit v1.2.3-59-g8ed1b From 08a06b83ff8b2779289f733348c669f31cb65d51 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 2 Feb 2009 20:30:40 -0200 Subject: blkftrace: binary tracing, synthesizing old format Impact: new feature With this and a blkrawverify modified not to verify the sequence numbers we can start using the userspace tools to verify that the data produced with the ftrace plugin works as expected. Example: [root@f10-1 ~]# echo 1 > /sys/block/sda/sda1/trace/enable [root@f10-1 ~]# echo bin > /d/tracing/trace_options [root@f10-1 ~]# echo blk > /d/tracing/current_tracer [root@f10-1 ~]# cat /d/tracing/trace_pipe > sda1.blktrace.0 ^C [root@f10-1 ~]# ./blkrawverify --noseq sda1 Verifying sda1 CPU 0 Wrote output to sda1.verify.out [root@f10-1 ~]# cat sda1.verify.out --------------- Verifying sda1 --------------------- Summary for cpu 0: 1349 valid + 0 invalid (100.0%) processed [root@f10-1 ~]# Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/blktrace.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 570cd3c40bd1..4f45b343690a 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -219,9 +219,16 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->sequence = ++(*sequence); t->time = ktime_to_ns(ktime_get()); - t->cpu = cpu; - t->pid = pid; record_it: + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; + t->sector = sector; t->bytes = bytes; t->action = what; @@ -1086,6 +1093,7 @@ static void blk_tracer_start(struct trace_array *tr) if (blk_register_tracepoints()) atomic_dec(&blk_probes_ref); mutex_unlock(&blk_probe_mutex); + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } static int blk_tracer_init(struct trace_array *tr) @@ -1100,6 +1108,7 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { + trace_flags |= TRACE_ITER_CONTEXT_INFO; mutex_lock(&blk_probe_mutex); if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); @@ -1147,6 +1156,9 @@ static int blk_trace_event_print(struct trace_iterator *iter, int flags) const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); int ret; + if (trace_print_context(iter)) + return TRACE_TYPE_PARTIAL_LINE; + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) ret = trace_seq_printf(s, "Bad pc action %x\n", what); else { @@ -1159,6 +1171,28 @@ static int blk_trace_event_print(struct trace_iterator *iter, int flags) return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } +static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace old = { + .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, + .time = ns2usecs(iter->ts), + }; + + if (!trace_seq_putmem(s, &old, offset)) + return 0; + return trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); +} + +static int blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +{ + return blk_trace_synthesize_old_trace(iter) ? + TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) { const struct blk_io_trace *t; @@ -1200,7 +1234,7 @@ static struct trace_event trace_blk_event = { .latency_trace = blk_trace_event_print, .raw = trace_nop_print, .hex = trace_nop_print, - .binary = trace_nop_print, + .binary = blk_trace_event_print_binary, }; static int __init init_blk_tracer(void) -- cgit v1.2.3-59-g8ed1b From 939b366977d29b5c0d53d1ea3b0b8cefb1e76202 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Feb 2009 11:58:29 -0200 Subject: blktrace: fix coding style in recent patches Impact: cleanup Reported-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- block/blktrace.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 4f45b343690a..8f5c37b0f80f 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include <../kernel/trace/trace_output.h> static unsigned int blktrace_seq __read_mostly = 1; @@ -148,11 +148,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, /* * Data direction bit lookup */ -static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; +static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; /* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) ) +#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) /* * The worker for the various blk_add_trace*() types. Fills out a @@ -221,13 +222,13 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, t->time = ktime_to_ns(ktime_get()); record_it: /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. - */ - t->cpu = cpu; - t->pid = pid; + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; t->sector = sector; t->bytes = bytes; @@ -453,7 +454,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, atomic_set(&bt->dropped, 0); ret = -EIO; - bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); + bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, + &blk_dropped_fops); if (!bt->dropped_file) goto err; @@ -535,10 +537,10 @@ EXPORT_SYMBOL_GPL(blk_trace_setup); int blk_trace_startstop(struct request_queue *q, int start) { - struct blk_trace *bt; int ret; + struct blk_trace *bt = q->blk_trace; - if ((bt = q->blk_trace) == NULL) + if (bt == NULL) return -EINVAL; /* @@ -674,12 +676,14 @@ static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) blk_add_trace_rq(q, rq, BLK_TA_ISSUE); } -static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_requeue(struct request_queue *q, + struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_complete(struct request_queue *q, + struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); } @@ -716,12 +720,14 @@ static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); } -static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_backmerge(struct request_queue *q, + struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); } -static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_frontmerge(struct request_queue *q, + struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); } @@ -731,7 +737,8 @@ static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) blk_add_trace_bio(q, bio, BLK_TA_QUEUE); } -static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw) +static void blk_add_trace_getrq(struct request_queue *q, + struct bio *bio, int rw) { if (bio) blk_add_trace_bio(q, bio, BLK_TA_GETRQ); @@ -744,7 +751,8 @@ static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw } -static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw) +static void blk_add_trace_sleeprq(struct request_queue *q, + struct bio *bio, int rw) { if (bio) blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); @@ -752,7 +760,8 @@ static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, + 0, 0, NULL); } } -- cgit v1.2.3-59-g8ed1b From 3228149ceb8b045e324cd268be9182bb26e6488b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 22 Jan 2009 15:37:47 +1100 Subject: xfs: Check buffer lengths in log recovery Before trying to obtain, read or write a buffer, check that the buffer length is actually valid. If it is not valid, then something read in the recovery process has been corrupted and we should abort recovery. Reported-by: Eric Sesterhenn Tested-by: Eric Sesterhenn Reviewed-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Dave Chinner Signed-off-by: Felix Blyakher --- fs/xfs/xfs_log_recover.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 35cca98bd94c..b1047de2fffd 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -70,16 +70,21 @@ STATIC void xlog_recover_check_summary(xlog_t *); xfs_buf_t * xlog_get_bp( xlog_t *log, - int num_bblks) + int nbblks) { - ASSERT(num_bblks > 0); + if (nbblks <= 0 || nbblks > log->l_logBBsize) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); + XFS_ERROR_REPORT("xlog_get_bp(1)", + XFS_ERRLEVEL_HIGH, log->l_mp); + return NULL; + } if (log->l_sectbb_log) { - if (num_bblks > 1) - num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); - num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); + if (nbblks > 1) + nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); + nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); } - return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); + return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); } void @@ -102,6 +107,13 @@ xlog_bread( { int error; + if (nbblks <= 0 || nbblks > log->l_logBBsize) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); + XFS_ERROR_REPORT("xlog_bread(1)", + XFS_ERRLEVEL_HIGH, log->l_mp); + return EFSCORRUPTED; + } + if (log->l_sectbb_log) { blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); @@ -139,6 +151,13 @@ xlog_bwrite( { int error; + if (nbblks <= 0 || nbblks > log->l_logBBsize) { + xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); + XFS_ERROR_REPORT("xlog_bwrite(1)", + XFS_ERRLEVEL_HIGH, log->l_mp); + return EFSCORRUPTED; + } + if (log->l_sectbb_log) { blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); -- cgit v1.2.3-59-g8ed1b From b58a4cc51f4f569c6a86979890f9b237a504ab6b Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Tue, 3 Feb 2009 15:37:18 -0600 Subject: xfs: Update maintainers New maintainer contact info. Signed-off-by: Felix Blyakher --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 474ec0c53272..d23851bf81f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4856,7 +4856,8 @@ S: Supported XFS FILESYSTEM P: Silicon Graphics Inc -P: Bill O'Donnell +P: Felix Blyakher +M: felixb@sgi.com M: xfs-masters@oss.sgi.com L: xfs@oss.sgi.com W: http://oss.sgi.com/projects/xfs -- cgit v1.2.3-59-g8ed1b From f9057e3da79d18fdbd9d6adbb183f032c614feeb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Feb 2009 09:31:52 +0100 Subject: xfs: cleanup error handling in xfs_mountfs: Clean up the error handling in xfs_mountfs. Use readable goto label names, simplify the uuid handling and other error conditions. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher --- fs/xfs/xfs_mount.c | 77 +++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 42 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 35300250e86d..86ac80c897c3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -886,8 +886,6 @@ xfs_check_sizes(xfs_mount_t *mp) } /* - * xfs_mountfs - * * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct * - if we're a 32-bit kernel, do a size check on the superblock @@ -905,7 +903,6 @@ xfs_mountfs( xfs_inode_t *rip; __uint64_t resblks; uint quotamount, quotaflags; - int uuid_mounted = 0; int error = 0; xfs_mount_common(mp, sbp); @@ -960,7 +957,7 @@ xfs_mountfs( */ error = xfs_update_alignment(mp); if (error) - goto error1; + goto out; xfs_alloc_compute_maxlevels(mp); xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); @@ -977,12 +974,11 @@ xfs_mountfs( * since a single partition filesystem is identical to a single * partition volume/filesystem. */ - if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) { + if (!(mp->m_flags & XFS_MOUNT_NOUUID)) { if (xfs_uuid_mount(mp)) { error = XFS_ERROR(EINVAL); - goto error1; + goto out; } - uuid_mounted=1; } /* @@ -1007,7 +1003,7 @@ xfs_mountfs( */ error = xfs_check_sizes(mp); if (error) - goto error1; + goto out_remove_uuid; /* * Initialize realtime fields in the mount structure @@ -1015,7 +1011,7 @@ xfs_mountfs( error = xfs_rtmount_init(mp); if (error) { cmn_err(CE_WARN, "XFS: RT mount failed"); - goto error1; + goto out_remove_uuid; } /* @@ -1045,26 +1041,26 @@ xfs_mountfs( mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_MAYFAIL); if (!mp->m_perag) - goto error1; + goto out_remove_uuid; mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); + if (!sbp->sb_logblocks) { + cmn_err(CE_WARN, "XFS: no log defined"); + XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto out_free_perag; + } + /* * log's mount-time initialization. Perform 1st part recovery if needed */ - if (likely(sbp->sb_logblocks > 0)) { /* check for volume case */ - error = xfs_log_mount(mp, mp->m_logdev_targp, - XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), - XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); - if (error) { - cmn_err(CE_WARN, "XFS: log mount failed"); - goto error2; - } - } else { /* No log has been defined */ - cmn_err(CE_WARN, "XFS: no log defined"); - XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); - goto error2; + error = xfs_log_mount(mp, mp->m_logdev_targp, + XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), + XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); + if (error) { + cmn_err(CE_WARN, "XFS: log mount failed"); + goto out_free_perag; } /* @@ -1086,15 +1082,14 @@ xfs_mountfs( * If we are currently making the filesystem, the initialisation will * fail as the perag data is in an undefined state. */ - if (xfs_sb_version_haslazysbcount(&mp->m_sb) && !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && !mp->m_sb.sb_inprogress) { error = xfs_initialize_perag_data(mp, sbp->sb_agcount); - if (error) { - goto error2; - } + if (error) + goto out_free_perag; } + /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. @@ -1102,7 +1097,7 @@ xfs_mountfs( error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); if (error) { cmn_err(CE_WARN, "XFS: failed to read root inode"); - goto error3; + goto out_log_dealloc; } ASSERT(rip != NULL); @@ -1116,7 +1111,7 @@ xfs_mountfs( XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); - goto error4; + goto out_rele_rip; } mp->m_rootip = rip; /* save it */ @@ -1131,7 +1126,7 @@ xfs_mountfs( * Free up the root inode. */ cmn_err(CE_WARN, "XFS: failed to read RT inodes"); - goto error4; + goto out_rele_rip; } /* @@ -1143,7 +1138,7 @@ xfs_mountfs( error = xfs_mount_log_sb(mp, mp->m_update_flags); if (error) { cmn_err(CE_WARN, "XFS: failed to write sb changes"); - goto error4; + goto out_rele_rip; } } @@ -1152,7 +1147,7 @@ xfs_mountfs( */ error = XFS_QM_INIT(mp, "amount, "aflags); if (error) - goto error4; + goto out_rele_rip; /* * Finish recovering the file system. This part needed to be @@ -1162,7 +1157,7 @@ xfs_mountfs( error = xfs_log_mount_finish(mp); if (error) { cmn_err(CE_WARN, "XFS: log mount finish failed"); - goto error4; + goto out_rele_rip; } /* @@ -1170,7 +1165,7 @@ xfs_mountfs( */ error = XFS_QM_MOUNT(mp, quotamount, quotaflags); if (error) - goto error4; + goto out_rele_rip; /* * Now we are mounted, reserve a small amount of unused space for @@ -1194,18 +1189,16 @@ xfs_mountfs( return 0; - error4: - /* - * Free up the root inode. - */ + out_rele_rip: IRELE(rip); - error3: + out_log_dealloc: xfs_log_unmount_dealloc(mp); - error2: + out_free_perag: xfs_free_perag(mp); - error1: - if (uuid_mounted) + out_remove_uuid: + if (!(mp->m_flags & XFS_MOUNT_NOUUID)) uuid_table_remove(&mp->m_sb.sb_uuid); + out: return error; } -- cgit v1.2.3-59-g8ed1b From b93b6e434c046459cf3111c76dce46ba4abcb2b6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Feb 2009 09:33:58 +0100 Subject: xfs: make sure to free the real-time inodes in the mount error path When mount fails after allocating the real-time inodes we currently leak them. Add a new helper to free the real-time inodes which can be used by both the mount and unmount path. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher --- fs/xfs/xfs_mount.c | 15 +++++++-------- fs/xfs/xfs_rtalloc.c | 10 ++++++++++ fs/xfs/xfs_rtalloc.h | 4 ++++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 86ac80c897c3..664961e45e02 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1138,7 +1138,7 @@ xfs_mountfs( error = xfs_mount_log_sb(mp, mp->m_update_flags); if (error) { cmn_err(CE_WARN, "XFS: failed to write sb changes"); - goto out_rele_rip; + goto out_rtunmount; } } @@ -1147,7 +1147,7 @@ xfs_mountfs( */ error = XFS_QM_INIT(mp, "amount, "aflags); if (error) - goto out_rele_rip; + goto out_rtunmount; /* * Finish recovering the file system. This part needed to be @@ -1157,7 +1157,7 @@ xfs_mountfs( error = xfs_log_mount_finish(mp); if (error) { cmn_err(CE_WARN, "XFS: log mount finish failed"); - goto out_rele_rip; + goto out_rtunmount; } /* @@ -1165,7 +1165,7 @@ xfs_mountfs( */ error = XFS_QM_MOUNT(mp, quotamount, quotaflags); if (error) - goto out_rele_rip; + goto out_rtunmount; /* * Now we are mounted, reserve a small amount of unused space for @@ -1189,6 +1189,8 @@ xfs_mountfs( return 0; + out_rtunmount: + xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); out_log_dealloc: @@ -1219,10 +1221,7 @@ xfs_unmountfs( */ XFS_QM_UNMOUNT(mp); - if (mp->m_rbmip) - IRELE(mp->m_rbmip); - if (mp->m_rsumip) - IRELE(mp->m_rsumip); + xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); /* diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index c5bb86f3ec05..385f6dceba5d 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -2288,6 +2288,16 @@ xfs_rtmount_inodes( return 0; } +void +xfs_rtunmount_inodes( + struct xfs_mount *mp) +{ + if (mp->m_rbmip) + IRELE(mp->m_rbmip); + if (mp->m_rsumip) + IRELE(mp->m_rsumip); +} + /* * Pick an extent for allocation at the start of a new realtime file. * Use the sequence number stored in the atime field of the bitmap inode. diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 8d8dcd215716..3bac681218a4 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -108,6 +108,9 @@ xfs_rtfree_extent( int /* error */ xfs_rtmount_init( struct xfs_mount *mp); /* file system mount structure */ +void +xfs_rtunmount_inodes( + struct xfs_mount *mp); /* * Get the bitmap and summary inodes into the mount structure @@ -146,6 +149,7 @@ xfs_growfs_rt( # define xfs_growfs_rt(mp,in) (ENOSYS) # define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ #endif /* __KERNEL__ */ -- cgit v1.2.3-59-g8ed1b From cb3f35bb3bf0759e00cd4f68155da9b636421f84 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Feb 2009 09:34:20 +0100 Subject: xfs: tiny cleanup for xfs_link The source and target inodes are guaranteed to never be the same by the VFS, so no need to check for that (and we would get into bad trouble later anyway if that were the case). Also clean up the error handling to use two gotos instead of nested conditions. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher --- fs/xfs/xfs_vnodeops.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 0e55c5d7db5f..4229408664ea 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2004,8 +2004,10 @@ xfs_link( /* Return through std_return after this point. */ error = XFS_QM_DQATTACH(mp, sip, 0); - if (!error && sip != tdp) - error = XFS_QM_DQATTACH(mp, tdp, 0); + if (error) + goto std_return; + + error = XFS_QM_DQATTACH(mp, tdp, 0); if (error) goto std_return; -- cgit v1.2.3-59-g8ed1b From c52e9fd8a9d3ac019680ffa315c1a0689d401ce3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Feb 2009 09:34:34 +0100 Subject: xfs: remove unused XFS_MOUNT_ILOCK/XFS_MOUNT_IUNLOCK These aren't only unused but also reference a lock that doesn't exist anymore. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher --- fs/xfs/xfs_mount.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index f5e9937f9bdb..670c10e098a0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -500,9 +500,6 @@ typedef struct xfs_mod_sb { int64_t msb_delta; /* Change to make to specified field */ } xfs_mod_sb_t; -#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) -#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) - extern int xfs_log_sbcount(xfs_mount_t *, uint); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); -- cgit v1.2.3-59-g8ed1b From e1486dea0bf4bc75a52a983281076f454a894b66 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Feb 2009 09:36:00 +0100 Subject: xfs: factor out attr fork reset handling We currently duplicate code to reset the attribute fork after the last attribute has been deleted. Factor this out into a small helper. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher --- fs/xfs/xfs_attr_leaf.c | 55 +++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 6c323f8a4cd1..aa001629b596 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -297,6 +297,26 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) xfs_sbversion_add_attr2(mp, args->trans); } +/* + * After the last attribute is removed revert to original inode format, + * making all literal area available to the data fork once more. + */ +STATIC void +xfs_attr_fork_reset( + struct xfs_inode *ip, + struct xfs_trans *tp) +{ + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + ip->i_d.di_forkoff = 0; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_afp == NULL); + + ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + /* * Remove an attribute from the shortform attribute list structure. */ @@ -344,22 +364,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) */ totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && - !(args->op_flags & XFS_DA_OP_ADDNAME) && - (mp->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) { - /* - * Last attribute now removed, revert to original - * inode format making all literal area available - * to the data fork once more. - */ - xfs_idestroy_fork(dp, XFS_ATTR_FORK); - dp->i_d.di_forkoff = 0; - dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ASSERT(dp->i_d.di_anextents == 0); - ASSERT(dp->i_afp == NULL); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + (mp->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + !(args->op_flags & XFS_DA_OP_ADDNAME)) { + xfs_attr_fork_reset(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); @@ -786,20 +794,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); - - /* - * Last attribute was removed, revert to original - * inode format making all literal area available - * to the data fork once more. - */ - xfs_idestroy_fork(dp, XFS_ATTR_FORK); - dp->i_d.di_forkoff = 0; - dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ASSERT(dp->i_d.di_anextents == 0); - ASSERT(dp->i_afp == NULL); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + xfs_attr_fork_reset(dp, args->trans); goto out; } -- cgit v1.2.3-59-g8ed1b From d4bb6d0698090c485e2e80e8a13852be5a8bfb04 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Feb 2009 09:36:19 +0100 Subject: xfs: merge xfs_inode_flush into xfs_fs_write_inode Splitting the task for a VFS-induced inode flush into two functions doesn't make any sense, so merge the two functions dealing with it. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Reviewed-by: Dave Chinner --- fs/xfs/linux-2.6/xfs_super.c | 43 ++++++++++++++++++++++++++++++++++++------ fs/xfs/linux-2.6/xfs_vnode.h | 5 ----- fs/xfs/xfs_vnodeops.c | 45 -------------------------------------------- fs/xfs/xfs_vnodeops.h | 1 - 4 files changed, 37 insertions(+), 57 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index c71e226da7f5..faf3aa3ca154 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -990,26 +990,57 @@ xfs_fs_write_inode( int sync) { struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; int error = 0; - int flags = 0; xfs_itrace_entry(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + if (sync) { error = xfs_wait_on_pages(ip, 0, -1); if (error) - goto out_error; - flags |= FLUSH_SYNC; + goto out; + } + + /* + * Bypass inodes which have already been cleaned by + * the inode flush clustering code inside xfs_iflush + */ + if (xfs_inode_clean(ip)) + goto out; + + /* + * We make this non-blocking if the inode is contended, return + * EAGAIN to indicate to the caller that they did not succeed. + * This prevents the flush path from blocking on inodes inside + * another operation right now, they get caught later by xfs_sync. + */ + if (sync) { + xfs_ilock(ip, XFS_ILOCK_SHARED); + xfs_iflock(ip); + + error = xfs_iflush(ip, XFS_IFLUSH_SYNC); + } else { + error = EAGAIN; + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + goto out; + if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) + goto out_unlock; + + error = xfs_iflush(ip, XFS_IFLUSH_ASYNC_NOBLOCK); } - error = xfs_inode_flush(ip, flags); -out_error: + out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + out: /* * if we failed to write out the inode then mark * it dirty again so we'll try again later. */ if (error) xfs_mark_inode_dirty_sync(ip); - return -error; } diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index f65983a230d3..ea4675c48209 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -40,11 +40,6 @@ struct attrlist_cursor_kern; #define IO_ISDIRECT 0x00004 /* bypass page cache */ #define IO_INVIS 0x00020 /* don't update inode timestamps */ -/* - * Flags for xfs_inode_flush - */ -#define FLUSH_SYNC 1 /* wait for flush to complete */ - /* * Flush/Invalidate options for vop_toss/flush/flushinval_pages. */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 4229408664ea..bc0a0a75b1d6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2588,51 +2588,6 @@ std_return: goto std_return; } -int -xfs_inode_flush( - xfs_inode_t *ip, - int flags) -{ - xfs_mount_t *mp = ip->i_mount; - int error = 0; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* - * Bypass inodes which have already been cleaned by - * the inode flush clustering code inside xfs_iflush - */ - if (xfs_inode_clean(ip)) - return 0; - - /* - * We make this non-blocking if the inode is contended, - * return EAGAIN to indicate to the caller that they - * did not succeed. This prevents the flush path from - * blocking on inodes inside another operation right - * now, they get caught later by xfs_sync. - */ - if (flags & FLUSH_SYNC) { - xfs_ilock(ip, XFS_ILOCK_SHARED); - xfs_iflock(ip); - } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return EAGAIN; - } - } else { - return EAGAIN; - } - - error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC - : XFS_IFLUSH_ASYNC_NOBLOCK); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - return error; -} - - int xfs_set_dmattrs( xfs_inode_t *ip, diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 76df328c61b4..2258df3fae84 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -38,7 +38,6 @@ int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, mode_t mode, struct xfs_inode **ipp, cred_t *credp); -int xfs_inode_flush(struct xfs_inode *ip, int flags); int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); int xfs_reclaim(struct xfs_inode *ip); int xfs_change_file_space(struct xfs_inode *ip, int cmd, -- cgit v1.2.3-59-g8ed1b From ef8f7fc549bf345d92f396f5aa7b152b4969cbf7 Mon Sep 17 00:00:00 2001 From: Josef 'Jeff' Sipek Date: Wed, 4 Feb 2009 09:37:43 +0100 Subject: xfs: cleanup error handling in xfs_swap_extents Use multiple lables for proper error unwinding and get rid of some now superflous variables. Signed-off-by: Josef 'Jeff' Sipek Signed-off-by: Christoph Hellwig Tested-by: Christoph Hellwig Reviewed-by: Felix Blyakher --- fs/xfs/xfs_dfrag.c | 62 ++++++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index f8278cfcc1d3..ac96ab9f70a2 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -118,19 +118,17 @@ xfs_swap_extents( xfs_bstat_t *sbp = &sxp->sx_stat; xfs_ifork_t *tempifp, *ifp, *tifp; int ilf_fields, tilf_fields; - static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; int error = 0; int aforkblks = 0; int taforkblks = 0; __uint64_t tmp; - char locked = 0; mp = ip->i_mount; tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); - goto error0; + goto out; } sbp = &sxp->sx_stat; @@ -143,25 +141,24 @@ xfs_swap_extents( */ xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - locked = 1; /* Verify that both files have the same format */ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_unlock; } /* Verify both files are either real-time or non-realtime */ if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_unlock; } /* Should never get a local format */ if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_unlock; } if (VN_CACHED(VFS_I(tip)) != 0) { @@ -169,13 +166,13 @@ xfs_swap_extents( error = xfs_flushinval_pages(tip, 0, -1, FI_REMAPF_LOCKED); if (error) - goto error0; + goto out_unlock; } /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_unlock; } /* Verify all data are being swapped */ @@ -183,7 +180,7 @@ xfs_swap_extents( sxp->sx_length != ip->i_d.di_size || sxp->sx_length != tip->i_d.di_size) { error = XFS_ERROR(EFAULT); - goto error0; + goto out_unlock; } /* @@ -193,7 +190,7 @@ xfs_swap_extents( */ if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { error = XFS_ERROR(EINVAL); - goto error0; + goto out_unlock; } /* @@ -208,7 +205,7 @@ xfs_swap_extents( (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { error = XFS_ERROR(EBUSY); - goto error0; + goto out_unlock; } /* We need to fail if the file is memory mapped. Once we have tossed @@ -219,7 +216,7 @@ xfs_swap_extents( */ if (VN_MAPPED(VFS_I(ip))) { error = XFS_ERROR(EBUSY); - goto error0; + goto out_unlock; } xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -242,8 +239,7 @@ xfs_swap_extents( xfs_iunlock(ip, XFS_IOLOCK_EXCL); xfs_iunlock(tip, XFS_IOLOCK_EXCL); xfs_trans_cancel(tp, 0); - locked = 0; - goto error0; + goto out; } xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); @@ -253,19 +249,15 @@ xfs_swap_extents( if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); - if (error) { - xfs_trans_cancel(tp, 0); - goto error0; - } + if (error) + goto out_trans_cancel; } if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &taforkblks); - if (error) { - xfs_trans_cancel(tp, 0); - goto error0; - } + if (error) + goto out_trans_cancel; } /* @@ -332,10 +324,10 @@ xfs_swap_extents( IHOLD(ip); - xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); IHOLD(tip); - xfs_trans_ijoin(tp, tip, lock_flags); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_log_inode(tp, ip, ilf_fields); xfs_trans_log_inode(tp, tip, tilf_fields); @@ -344,19 +336,19 @@ xfs_swap_extents( * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - } error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); - locked = 0; - error0: - if (locked) { - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); - } - if (tempifp != NULL) - kmem_free(tempifp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out: + kmem_free(tempifp); return error; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + goto out_unlock; } -- cgit v1.2.3-59-g8ed1b From d9793bd8018f835c64b10f44e278c86cecb8e932 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Feb 2009 20:20:41 -0200 Subject: trace: judicious error checking of trace_seq results Impact: bugfix and cleanup Some callsites were returning either TRACE_ITER_PARTIAL_LINE if the trace_seq routines (trace_seq_printf, etc) returned 0 meaning its buffer was full, or zero otherwise. But... /* Return values for print_line callback */ enum print_line_t { TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ TRACE_TYPE_HANDLED = 1, TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ }; In other cases the return value was not being relayed at all. Most of the time it didn't hurt because the page wasn't get filled, but for correctness sake, handle the return values everywhere. Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 2 +- kernel/trace/trace.c | 75 ++++++++++++--------------- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_output.c | 123 ++++++++++++++++++-------------------------- 4 files changed, 87 insertions(+), 115 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 8f5c37b0f80f..12df27693972 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1165,7 +1165,7 @@ static int blk_trace_event_print(struct trace_iterator *iter, int flags) const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); int ret; - if (trace_print_context(iter)) + if (!trace_print_context(iter)) return TRACE_TYPE_PARTIAL_LINE; if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bbdfaa2cbdb9..5822ff4e5a3e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1402,27 +1402,25 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter) unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_event *event; struct trace_entry *entry = iter->ent; - int ret; test_cpu_buff_start(iter); event = ftrace_find_event(entry->type); if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - ret = trace_print_lat_context(iter); - if (ret) - return ret; + if (!trace_print_lat_context(iter)) + goto partial; } - if (event && event->latency_trace) { - ret = event->latency_trace(iter, sym_flags); - if (ret) - return ret; - return TRACE_TYPE_HANDLED; - } + if (event && event->latency_trace) + return event->latency_trace(iter, sym_flags); + + if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) + goto partial; - trace_seq_printf(s, "Unknown type %d\n", entry->type); return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; } static enum print_line_t print_trace_fmt(struct trace_iterator *iter) @@ -1431,7 +1429,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_entry *entry; struct trace_event *event; - int ret; entry = iter->ent; @@ -1440,22 +1437,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - ret = trace_print_context(iter); - if (ret) - return ret; + if (!trace_print_context(iter)) + goto partial; } - if (event && event->trace) { - ret = event->trace(iter, sym_flags); - if (ret) - return ret; - return TRACE_TYPE_HANDLED; - } - ret = trace_seq_printf(s, "Unknown type %d\n", entry->type); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (event && event->trace) + return event->trace(iter, sym_flags); + + if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) + goto partial; return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; } static enum print_line_t print_raw_fmt(struct trace_iterator *iter) @@ -1463,29 +1457,25 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry; struct trace_event *event; - int ret; entry = iter->ent; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - ret = trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, iter->ts); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (!trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts)) + goto partial; } event = ftrace_find_event(entry->type); - if (event && event->raw) { - ret = event->raw(iter, 0); - if (ret) - return ret; - return TRACE_TYPE_HANDLED; - } - ret = trace_seq_printf(s, "%d ?\n", entry->type); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (event && event->raw) + return event->raw(iter, 0); + + if (!trace_seq_printf(s, "%d ?\n", entry->type)) + goto partial; return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; } static enum print_line_t print_hex_fmt(struct trace_iterator *iter) @@ -1504,8 +1494,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - if (event && event->hex) - event->hex(iter, 0); + if (event && event->hex) { + int ret = event->hex(iter, 0); + if (ret != TRACE_TYPE_HANDLED) + return ret; + } SEQ_PUT_FIELD_RET(s, newline); @@ -1544,7 +1537,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event && event->binary) - event->binary(iter, 0); + return event->binary(iter, 0); return TRACE_TYPE_HANDLED; } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index ea62f101e615..f6b35e162dfa 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -173,7 +173,7 @@ static int trace_branch_print(struct trace_iterator *iter, int flags) field->line)) return TRACE_TYPE_PARTIAL_LINE; - return 0; + return TRACE_TYPE_HANDLED; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c24503b281a0..5b3c914053f2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -286,55 +286,41 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) return ret; } -static void +static int lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { int hardirq, softirq; char *comm; comm = trace_find_cmdline(entry->pid); - - trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); - trace_seq_printf(s, "%3d", cpu); - trace_seq_printf(s, "%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.', - ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); - hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; - if (hardirq && softirq) { - trace_seq_putc(s, 'H'); - } else { - if (hardirq) { - trace_seq_putc(s, 'h'); - } else { - if (softirq) - trace_seq_putc(s, 's'); - else - trace_seq_putc(s, '.'); - } - } + + if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", + comm, entry->pid, cpu, + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? + 'X' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? + 'N' : '.', + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : softirq ? 's' : '.')) + return 0; if (entry->preempt_count) - trace_seq_printf(s, "%x", entry->preempt_count); - else - trace_seq_puts(s, "."); + return trace_seq_printf(s, "%x", entry->preempt_count); + return trace_seq_puts(s, "."); } static unsigned long preempt_mark_thresh = 100; -static void +static int lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, unsigned long rel_usecs) { - trace_seq_printf(s, " %4lldus", abs_usecs); - if (rel_usecs > preempt_mark_thresh) - trace_seq_puts(s, "!: "); - else if (rel_usecs > 1) - trace_seq_puts(s, "+: "); - else - trace_seq_puts(s, " : "); + return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, + rel_usecs > preempt_mark_thresh ? '!' : + rel_usecs > 1 ? '+' : ' '); } int trace_print_context(struct trace_iterator *iter) @@ -346,22 +332,14 @@ int trace_print_context(struct trace_iterator *iter) unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned long secs = (unsigned long)t; - if (!trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid)) - goto partial; - if (!trace_seq_printf(s, "[%03d] ", entry->cpu)) - goto partial; - if (!trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem)) - goto partial; - - return 0; - -partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", + comm, entry->pid, entry->cpu, secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) { u64 next_ts; + int ret; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent, *next_entry = trace_find_next_entry(iter, NULL, @@ -376,21 +354,22 @@ int trace_print_lat_context(struct trace_iterator *iter) if (verbose) { char *comm = trace_find_cmdline(entry->pid); - trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" - " %ld.%03ldms (+%ld.%03ldms): ", - comm, - entry->pid, entry->cpu, entry->flags, - entry->preempt_count, iter->idx, - ns2usecs(iter->ts), - abs_usecs/1000, - abs_usecs % 1000, rel_usecs/1000, - rel_usecs % 1000); + ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" + " %ld.%03ldms (+%ld.%03ldms): ", comm, + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, iter->idx, + ns2usecs(iter->ts), + abs_usecs / USEC_PER_MSEC, + abs_usecs % USEC_PER_MSEC, + rel_usecs / USEC_PER_MSEC, + rel_usecs % USEC_PER_MSEC); } else { - lat_print_generic(s, entry, entry->cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); + ret = lat_print_generic(s, entry, entry->cpu); + if (ret) + ret = lat_print_timestamp(s, abs_usecs, rel_usecs); } - return 0; + return ret; } static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; @@ -486,7 +465,7 @@ int unregister_ftrace_event(struct trace_event *event) int trace_nop_print(struct trace_iterator *iter, int flags) { - return 0; + return TRACE_TYPE_HANDLED; } /* TRACE_FN */ @@ -506,7 +485,7 @@ static int trace_fn_latency(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")\n")) goto partial; - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; @@ -533,7 +512,7 @@ static int trace_fn_trace(struct trace_iterator *iter, int flags) if (!trace_seq_printf(s, "\n")) goto partial; - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; @@ -550,7 +529,7 @@ static int trace_fn_raw(struct trace_iterator *iter, int flags) field->parent_ip)) return TRACE_TYPE_PARTIAL_LINE; - return 0; + return TRACE_TYPE_HANDLED; } static int trace_fn_hex(struct trace_iterator *iter, int flags) @@ -563,7 +542,7 @@ static int trace_fn_hex(struct trace_iterator *iter, int flags) SEQ_PUT_HEX_FIELD_RET(s, field->ip); SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); - return 0; + return TRACE_TYPE_HANDLED; } static int trace_fn_bin(struct trace_iterator *iter, int flags) @@ -576,7 +555,7 @@ static int trace_fn_bin(struct trace_iterator *iter, int flags) SEQ_PUT_FIELD_RET(s, field->ip); SEQ_PUT_FIELD_RET(s, field->parent_ip); - return 0; + return TRACE_TYPE_HANDLED; } static struct trace_event trace_fn_event = { @@ -611,7 +590,7 @@ static int trace_ctxwake_print(struct trace_iterator *iter, char *delim) T, comm)) return TRACE_TYPE_PARTIAL_LINE; - return 0; + return TRACE_TYPE_HANDLED; } static int trace_ctx_print(struct trace_iterator *iter, int flags) @@ -644,7 +623,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) T)) return TRACE_TYPE_PARTIAL_LINE; - return 0; + return TRACE_TYPE_HANDLED; } static int trace_ctx_raw(struct trace_iterator *iter, int flags) @@ -678,7 +657,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); SEQ_PUT_HEX_FIELD_RET(s, T); - return 0; + return TRACE_TYPE_HANDLED; } static int trace_ctx_hex(struct trace_iterator *iter, int flags) @@ -705,7 +684,7 @@ static int trace_ctxwake_bin(struct trace_iterator *iter, int flags) SEQ_PUT_FIELD_RET(s, field->next_prio); SEQ_PUT_FIELD_RET(s, field->next_state); - return 0; + return TRACE_TYPE_HANDLED; } static struct trace_event trace_ctx_event = { @@ -739,7 +718,7 @@ static int trace_special_print(struct trace_iterator *iter, int flags) field->arg3)) return TRACE_TYPE_PARTIAL_LINE; - return 0; + return TRACE_TYPE_HANDLED; } static int trace_special_hex(struct trace_iterator *iter, int flags) @@ -753,7 +732,7 @@ static int trace_special_hex(struct trace_iterator *iter, int flags) SEQ_PUT_HEX_FIELD_RET(s, field->arg2); SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - return 0; + return TRACE_TYPE_HANDLED; } static int trace_special_bin(struct trace_iterator *iter, int flags) @@ -767,7 +746,7 @@ static int trace_special_bin(struct trace_iterator *iter, int flags) SEQ_PUT_FIELD_RET(s, field->arg2); SEQ_PUT_FIELD_RET(s, field->arg3); - return 0; + return TRACE_TYPE_HANDLED; } static struct trace_event trace_special_event = { @@ -801,7 +780,7 @@ static int trace_stack_print(struct trace_iterator *iter, int flags) goto partial; } - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; @@ -830,7 +809,7 @@ static int trace_user_stack_print(struct trace_iterator *iter, int flags) if (!trace_seq_putc(s, '\n')) goto partial; - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; @@ -859,7 +838,7 @@ static int trace_print_print(struct trace_iterator *iter, int flags) if (!trace_seq_printf(s, ": %s", field->buf)) goto partial; - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; @@ -874,7 +853,7 @@ static int trace_print_raw(struct trace_iterator *iter, int flags) if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) goto partial; - return 0; + return TRACE_TYPE_HANDLED; partial: return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From ae7462b4f1fe1f36b5d562dbd5202a2eba01f072 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 3 Feb 2009 22:05:50 -0200 Subject: trace: make the trace_event callbacks return enum print_line_t As they actually all return these enumerators. Reported-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 6 ++++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_branch.c | 3 ++- kernel/trace/trace_output.c | 52 +++++++++++++++++++++++++++------------------ kernel/trace/trace_output.h | 5 +++-- 5 files changed, 41 insertions(+), 27 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 12df27693972..c7698d1617a1 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1158,7 +1158,8 @@ static struct { [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, }; -static int blk_trace_event_print(struct trace_iterator *iter, int flags) +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags) { struct trace_seq *s = &iter->seq; const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; @@ -1196,7 +1197,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) sizeof(old) - offset + t->pdu_len); } -static int blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +static enum print_line_t +blk_trace_event_print_binary(struct trace_iterator *iter, int flags) { return blk_trace_synthesize_old_trace(iter) ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5822ff4e5a3e..fd51cf0b94c7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1495,7 +1495,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event && event->hex) { - int ret = event->hex(iter, 0); + enum print_line_t ret = event->hex(iter, 0); if (ret != TRACE_TYPE_HANDLED) return ret; } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index f6b35e162dfa..7ac72a44b2d3 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -160,7 +160,8 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int trace_branch_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_branch_print(struct trace_iterator *iter, + int flags) { struct trace_branch *field; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5b3c914053f2..b7380eee9fa1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -463,13 +463,14 @@ int unregister_ftrace_event(struct trace_event *event) * Standard events */ -int trace_nop_print(struct trace_iterator *iter, int flags) +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) { return TRACE_TYPE_HANDLED; } /* TRACE_FN */ -static int trace_fn_latency(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_latency(struct trace_iterator *iter, + int flags) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -491,7 +492,7 @@ static int trace_fn_latency(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int trace_fn_trace(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -518,7 +519,7 @@ static int trace_fn_trace(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int trace_fn_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; @@ -532,7 +533,7 @@ static int trace_fn_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static int trace_fn_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -545,7 +546,7 @@ static int trace_fn_hex(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static int trace_fn_bin(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -568,7 +569,8 @@ static struct trace_event trace_fn_event = { }; /* TRACE_CTX an TRACE_WAKE */ -static int trace_ctxwake_print(struct trace_iterator *iter, char *delim) +static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, + char *delim) { struct ctx_switch_entry *field; char *comm; @@ -593,12 +595,13 @@ static int trace_ctxwake_print(struct trace_iterator *iter, char *delim) return TRACE_TYPE_HANDLED; } -static int trace_ctx_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) { return trace_ctxwake_print(iter, "==>"); } -static int trace_wake_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_print(struct trace_iterator *iter, + int flags) { return trace_ctxwake_print(iter, " +"); } @@ -626,12 +629,12 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static int trace_ctx_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) { return trace_ctxwake_raw(iter, 0); } -static int trace_wake_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) { return trace_ctxwake_raw(iter, '+'); } @@ -660,17 +663,18 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static int trace_ctx_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) { return trace_ctxwake_hex(iter, 0); } -static int trace_wake_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) { return trace_ctxwake_hex(iter, '+'); } -static int trace_ctxwake_bin(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, + int flags) { struct ctx_switch_entry *field; struct trace_seq *s = &iter->seq; @@ -706,7 +710,8 @@ static struct trace_event trace_wake_event = { }; /* TRACE_SPECIAL */ -static int trace_special_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_special_print(struct trace_iterator *iter, + int flags) { struct special_entry *field; @@ -721,7 +726,8 @@ static int trace_special_print(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static int trace_special_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_special_hex(struct trace_iterator *iter, + int flags) { struct special_entry *field; struct trace_seq *s = &iter->seq; @@ -735,7 +741,8 @@ static int trace_special_hex(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static int trace_special_bin(struct trace_iterator *iter, int flags) +static enum print_line_t trace_special_bin(struct trace_iterator *iter, + int flags) { struct special_entry *field; struct trace_seq *s = &iter->seq; @@ -760,7 +767,8 @@ static struct trace_event trace_special_event = { /* TRACE_STACK */ -static int trace_stack_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_stack_print(struct trace_iterator *iter, + int flags) { struct stack_entry *field; struct trace_seq *s = &iter->seq; @@ -796,7 +804,8 @@ static struct trace_event trace_stack_event = { }; /* TRACE_USER_STACK */ -static int trace_user_stack_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, + int flags) { struct userstack_entry *field; struct trace_seq *s = &iter->seq; @@ -825,7 +834,8 @@ static struct trace_event trace_user_stack_event = { }; /* TRACE_PRINT */ -static int trace_print_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags) { struct print_entry *field; struct trace_seq *s = &iter->seq; @@ -844,7 +854,7 @@ static int trace_print_print(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static int trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) { struct print_entry *field; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 3aeb31f6506b..551a25a72217 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -3,7 +3,8 @@ #include "trace.h" -typedef int (*trace_print_func)(struct trace_iterator *iter, int flags); +typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, + int flags); struct trace_event { struct hlist_node node; @@ -39,7 +40,7 @@ struct trace_event *ftrace_find_event(int type); int register_ftrace_event(struct trace_event *event); int unregister_ftrace_event(struct trace_event *event); -int trace_nop_print(struct trace_iterator *iter, int flags); +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) -- cgit v1.2.3-59-g8ed1b From f036be96dd9ce442ffb9ab33e3c165f5178815c0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Feb 2009 13:45:43 +0100 Subject: printk: introduce printk_once() This pattern shows up frequently in the kernel: static int once = 1; ... if (once) { once = 0; printk(KERN_ERR "message\n"); } ... So add a printk_once() helper macro that reduces this to a single line of: printk_once(KERN_ERR "message\n"); It works analogously to WARN_ONCE() & friends. (We use a macro not an inline because vararg expansion in inlines looks awkward and the macro is simple enough.) Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 343df9ef2412..3c183d9864ae 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -242,6 +242,19 @@ extern struct ratelimit_state printk_ratelimit_state; extern int printk_ratelimit(void); extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msec); + +/* + * Print a one-time message (analogous to WARN_ONCE() et al): + */ +#define printk_once(x...) ({ \ + static int __print_once = 1; \ + \ + if (__print_once) { \ + __print_once = 0; \ + printk(x); \ + } \ +}) + #else static inline int vprintk(const char *s, va_list args) __attribute__ ((format (printf, 1, 0))); @@ -253,6 +266,10 @@ static inline int printk_ratelimit(void) { return 0; } static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ unsigned int interval_msec) \ { return false; } + +/* No effect, but we still get type checking even in the !PRINTK case: */ +#define printk_once(x...) printk(x) + #endif extern int printk_needs_cpu(int cpu); -- cgit v1.2.3-59-g8ed1b From 268ccda0cb4d1292029d07ee3dbd07117baf6ecb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 4 Feb 2009 20:16:39 -0200 Subject: trace: assign defaults at register_ftrace_event Impact: simplification of tracers As all tracers are doing this we might as well do it in register_ftrace_event and save one branch each time we call these callbacks. Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- block/blktrace.c | 2 -- kernel/trace/trace.c | 13 +++++-------- kernel/trace/trace_branch.c | 3 --- kernel/trace/trace_output.c | 13 +++++++++++-- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index c7698d1617a1..1ebd068061ec 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1243,8 +1243,6 @@ static struct trace_event trace_blk_event = { .type = TRACE_BLK, .trace = blk_trace_event_print, .latency_trace = blk_trace_event_print, - .raw = trace_nop_print, - .hex = trace_nop_print, .binary = blk_trace_event_print_binary, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd51cf0b94c7..a5e4c0af9bb0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1412,7 +1412,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter) goto partial; } - if (event && event->latency_trace) + if (event) return event->latency_trace(iter, sym_flags); if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) @@ -1441,7 +1441,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) goto partial; } - if (event && event->trace) + if (event) return event->trace(iter, sym_flags); if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) @@ -1467,7 +1467,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - if (event && event->raw) + if (event) return event->raw(iter, 0); if (!trace_seq_printf(s, "%d ?\n", entry->type)) @@ -1494,7 +1494,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - if (event && event->hex) { + if (event) { enum print_line_t ret = event->hex(iter, 0); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -1536,10 +1536,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - if (event && event->binary) - return event->binary(iter, 0); - - return TRACE_TYPE_HANDLED; + return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; } static int trace_empty(struct trace_iterator *iter) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7ac72a44b2d3..297deb202b68 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -182,9 +182,6 @@ static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, .trace = trace_branch_print, .latency_trace = trace_branch_print, - .raw = trace_nop_print, - .hex = trace_nop_print, - .binary = trace_nop_print, }; static struct tracer branch_trace __read_mostly = diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b7380eee9fa1..b6e99af79214 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -435,6 +435,17 @@ int register_ftrace_event(struct trace_event *event) if (ftrace_find_event(event->type)) goto out; + if (event->trace == NULL) + event->trace = trace_nop_print; + if (event->latency_trace == NULL) + event->latency_trace = trace_nop_print; + if (event->raw == NULL) + event->raw = trace_nop_print; + if (event->hex == NULL) + event->hex = trace_nop_print; + if (event->binary == NULL) + event->binary = trace_nop_print; + key = event->type & (EVENT_HASHSIZE - 1); hlist_add_head_rcu(&event->node, &event_hash[key]); @@ -874,8 +885,6 @@ static struct trace_event trace_print_event = { .trace = trace_print_print, .latency_trace = trace_print_print, .raw = trace_print_raw, - .hex = trace_nop_print, - .binary = trace_nop_print, }; static struct trace_event *events[] __initdata = { -- cgit v1.2.3-59-g8ed1b From 97e5b191ae7dc0f4f5b82b9db29782928b103b4d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 01:13:36 -0500 Subject: trace_branch: Remove unused function Impact: cleanup Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_branch.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 297deb202b68..027e83690615 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -143,23 +143,6 @@ static void branch_trace_reset(struct trace_array *tr) stop_branch_trace(tr); } -static int -trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags) -{ - struct print_entry *field; - - trace_assign_type(field, entry); - - if (seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (trace_seq_printf(s, ": %s", field->buf)) - goto partial; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - static enum print_line_t trace_branch_print(struct trace_iterator *iter, int flags) { -- cgit v1.2.3-59-g8ed1b From 7be421510b91491d5aa5a29fa1005712039b95af Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 01:13:37 -0500 Subject: trace: Remove unused trace_array_cpu parameter Impact: cleanup Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- block/blktrace.c | 2 +- kernel/trace/trace.c | 47 +++++++++++++++------------------------ kernel/trace/trace.h | 4 ---- kernel/trace/trace_functions.c | 8 +++---- kernel/trace/trace_irqsoff.c | 10 ++++----- kernel/trace/trace_sched_switch.c | 4 ++-- kernel/trace/trace_sched_wakeup.c | 12 +++++----- 7 files changed, 35 insertions(+), 52 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 1ebd068061ec..d9d7146ee023 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -245,7 +245,7 @@ record_it: if (pid != 0 && !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && (trace_flags & TRACE_ITER_STACKTRACE) != 0) - __trace_stack(blk_tr, NULL, flags, 5, pc); + __trace_stack(blk_tr, flags, 5, pc); trace_wake_up(); return; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a5e4c0af9bb0..1d4ff568cc4d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -776,7 +776,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } void -trace_function(struct trace_array *tr, struct trace_array_cpu *data, +trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { @@ -802,7 +802,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static void __trace_graph_entry(struct trace_array *tr, - struct trace_array_cpu *data, struct ftrace_graph_ent *trace, unsigned long flags, int pc) @@ -826,7 +825,6 @@ static void __trace_graph_entry(struct trace_array *tr, } static void __trace_graph_return(struct trace_array *tr, - struct trace_array_cpu *data, struct ftrace_graph_ret *trace, unsigned long flags, int pc) @@ -856,11 +854,10 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, int pc) { if (likely(!atomic_read(&data->disabled))) - trace_function(tr, data, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, flags, pc); } static void __ftrace_trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, unsigned long flags, int skip, int pc) { @@ -891,27 +888,24 @@ static void __ftrace_trace_stack(struct trace_array *tr, } static void ftrace_trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, unsigned long flags, int skip, int pc) { if (!(trace_flags & TRACE_ITER_STACKTRACE)) return; - __ftrace_trace_stack(tr, data, flags, skip, pc); + __ftrace_trace_stack(tr, flags, skip, pc); } void __trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr, data, flags, skip, pc); + __ftrace_trace_stack(tr, flags, skip, pc); } static void ftrace_trace_userstack(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long flags, int pc) + unsigned long flags, int pc) { #ifdef CONFIG_STACKTRACE struct ring_buffer_event *event; @@ -942,20 +936,17 @@ static void ftrace_trace_userstack(struct trace_array *tr, #endif } -void __trace_userstack(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long flags) +void __trace_userstack(struct trace_array *tr, unsigned long flags) { - ftrace_trace_userstack(tr, data, flags, preempt_count()); + ftrace_trace_userstack(tr, flags, preempt_count()); } static void -ftrace_trace_special(void *__tr, void *__data, +ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { struct ring_buffer_event *event; - struct trace_array_cpu *data = __data; struct trace_array *tr = __tr; struct special_entry *entry; unsigned long irq_flags; @@ -971,8 +962,8 @@ ftrace_trace_special(void *__tr, void *__data, entry->arg2 = arg2; entry->arg3 = arg3; ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - ftrace_trace_stack(tr, data, irq_flags, 4, pc); - ftrace_trace_userstack(tr, data, irq_flags, pc); + ftrace_trace_stack(tr, irq_flags, 4, pc); + ftrace_trace_userstack(tr, irq_flags, pc); trace_wake_up(); } @@ -981,12 +972,11 @@ void __trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3) { - ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count()); + ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); } void tracing_sched_switch_trace(struct trace_array *tr, - struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc) @@ -1010,13 +1000,12 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - ftrace_trace_stack(tr, data, flags, 5, pc); - ftrace_trace_userstack(tr, data, flags, pc); + ftrace_trace_stack(tr, flags, 5, pc); + ftrace_trace_userstack(tr, flags, pc); } void tracing_sched_wakeup_trace(struct trace_array *tr, - struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *curr, unsigned long flags, int pc) @@ -1040,8 +1029,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - ftrace_trace_stack(tr, data, flags, 6, pc); - ftrace_trace_userstack(tr, data, flags, pc); + ftrace_trace_stack(tr, flags, 6, pc); + ftrace_trace_userstack(tr, flags, pc); trace_wake_up(); } @@ -1064,7 +1053,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) data = tr->data[cpu]; if (likely(atomic_inc_return(&data->disabled) == 1)) - ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); + ftrace_trace_special(tr, arg1, arg2, arg3, pc); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -1092,7 +1081,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_graph_entry(tr, data, trace, flags, pc); + __trace_graph_entry(tr, trace, flags, pc); } /* Only do the atomic if it is not already set */ if (!test_tsk_trace_graph(current)) @@ -1118,7 +1107,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_graph_return(tr, data, trace, flags, pc); + __trace_graph_return(tr, trace, flags, pc); } if (!trace->depth) clear_tsk_trace_graph(current); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f0c7a0f08cac..df627a948694 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -419,14 +419,12 @@ void ftrace(struct trace_array *tr, unsigned long parent_ip, unsigned long flags, int pc); void tracing_sched_switch_trace(struct trace_array *tr, - struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc); void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, - struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *cur, unsigned long flags, int pc); @@ -436,7 +434,6 @@ void trace_special(struct trace_array *tr, unsigned long arg2, unsigned long arg3, int pc); void trace_function(struct trace_array *tr, - struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); @@ -462,7 +459,6 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); void __trace_stack(struct trace_array *tr, - struct trace_array_cpu *data, unsigned long flags, int skip, int pc); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3a320f8aba7..d067cea2ccc3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -78,7 +78,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); ftrace_preempt_enable(resched); @@ -108,7 +108,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (likely(disabled == 1)) { pc = preempt_count(); - trace_function(tr, data, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, flags, pc); } atomic_dec(&data->disabled); @@ -139,7 +139,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) if (likely(disabled == 1)) { pc = preempt_count(); - trace_function(tr, data, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, flags, pc); /* * skip over 5 funcs: * __ftrace_trace_stack, @@ -148,7 +148,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) * ftrace_list_func * ftrace_call */ - __trace_stack(tr, data, flags, 5, pc); + __trace_stack(tr, flags, 5, pc); } atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ed344b022a14..c6b442d88de8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags, preempt_count()); + trace_function(tr, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -153,7 +153,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out_unlock; - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); + trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); latency = nsecs_to_usecs(delta); @@ -177,7 +177,7 @@ out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); tracing_reset(tr, cpu); - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); + trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } static inline void @@ -210,7 +210,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags, preempt_count()); + trace_function(tr, ip, parent_ip, flags, preempt_count()); per_cpu(tracing_cpu, cpu) = 1; @@ -244,7 +244,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags, preempt_count()); + trace_function(tr, ip, parent_ip, flags, preempt_count()); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index df175cb4564f..c4f9add5ec90 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -43,7 +43,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, data = ctx_trace->data[cpu]; if (likely(!atomic_read(&data->disabled))) - tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); + tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); local_irq_restore(flags); } @@ -66,7 +66,7 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) data = ctx_trace->data[cpu]; if (likely(!atomic_read(&data->disabled))) - tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, + tracing_sched_wakeup_trace(ctx_trace, wakee, current, flags, pc); local_irq_restore(flags); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index a48c9b4b0c85..96d716485898 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -72,7 +72,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) if (task_cpu(wakeup_task) != cpu) goto unlock; - trace_function(tr, data, ip, parent_ip, flags, pc); + trace_function(tr, ip, parent_ip, flags, pc); unlock: __raw_spin_unlock(&wakeup_lock); @@ -152,8 +152,8 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; - trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); - tracing_sched_switch_trace(wakeup_trace, data, prev, next, flags, pc); + trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); /* * usecs conversion is slow so we try to delay the conversion @@ -254,10 +254,8 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) data = wakeup_trace->data[wakeup_cpu]; data->preempt_timestamp = ftrace_now(cpu); - tracing_sched_wakeup_trace(wakeup_trace, data, p, current, - flags, pc); - trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, - flags, pc); + tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); + trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: __raw_spin_unlock(&wakeup_lock); -- cgit v1.2.3-59-g8ed1b From dac74940289f350c2590bec92737833bad608541 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 01:13:38 -0500 Subject: trace: code style clean up Ingo Molnar suggested using goto logic to keep the indentation down and to be able to remove the nasty line breaks. This actually makes the code a bit more readable. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1d4ff568cc4d..3536ef41575d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -522,23 +522,24 @@ int register_tracer(struct tracer *type) tracing_selftest_running = false; mutex_unlock(&trace_types_lock); - if (!ret && default_bootup_tracer) { - if (!strncmp(default_bootup_tracer, type->name, - BOOTUP_TRACER_SIZE)) { - printk(KERN_INFO "Starting tracer '%s'\n", - type->name); - /* Do we want this tracer to start on bootup? */ - tracing_set_tracer(type->name); - default_bootup_tracer = NULL; - /* disable other selftests, since this will break it. */ - tracing_selftest_disabled = 1; + if (ret || !default_bootup_tracer) + goto out_unlock; + + if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) + goto out_unlock; + + printk(KERN_INFO "Starting tracer '%s'\n", type->name); + /* Do we want this tracer to start on bootup? */ + tracing_set_tracer(type->name); + default_bootup_tracer = NULL; + /* disable other selftests, since this will break it. */ + tracing_selftest_disabled = 1; #ifdef CONFIG_FTRACE_STARTUP_TEST - printk(KERN_INFO "Disabling FTRACE selftests due" - " to running tracer '%s'\n", type->name); + printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", + type->name); #endif - } - } + out_unlock: lock_kernel(); return ret; } -- cgit v1.2.3-59-g8ed1b From ce9dbe244bf2063c41792e40dae7745957b118e0 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 4 Feb 2009 20:35:48 -0800 Subject: softlockup: check all tasks in hung_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: extend the scope of hung-task checks Changed the default value of hung_task_check_count to PID_MAX_LIMIT. hung_task_batch_count added to put an upper bound on the critical section. Every hung_task_batch_count checks, the rcu lock is never held for a too long time. Keeping the critical section small minimizes time preemption is disabled and keeps rcu grace periods small. To prevent following a stale pointer, get_task_struct is called on g and t. To verify that g and t have not been unhashed while outside the critical section, the task states are checked. The design was proposed by Frédéric Weisbecker. Signed-off-by: Mandeep Singh Baines Suggested-by: Frédéric Weisbecker Acked-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ba8ccd432963..481ca8b5c2bc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -17,9 +17,18 @@ #include /* - * Have a reasonable limit on the number of tasks checked: + * The number of tasks checked: */ -unsigned long __read_mostly sysctl_hung_task_check_count = 1024; +unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; + +/* + * Limit number of tasks checked in a batch. + * + * This value controls the preemptibility of khungtaskd since preemption + * is disabled during the critical section. It also controls the size of + * the RCU grace period. So it needs to be upper-bound. + */ +#define HUNG_TASK_BATCHING 1024 /* * Zero means infinite timeout - no checking done: @@ -109,6 +118,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now, panic("hung_task: blocked tasks"); } +/* + * To avoid extending the RCU grace period for an unbounded amount of time, + * periodically exit the critical section and enter a new one. + * + * For preemptible RCU it is sufficient to call rcu_read_unlock in order + * exit the grace period. For classic RCU, a reschedule is required. + */ +static void rcu_lock_break(struct task_struct *g, struct task_struct *t) +{ + get_task_struct(g); + get_task_struct(t); + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + put_task_struct(t); + put_task_struct(g); +} + /* * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for * a really long time (120 seconds). If that happens, print out @@ -117,6 +144,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now, static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; + int batch_count = HUNG_TASK_BATCHING; unsigned long now = get_timestamp(); struct task_struct *g, *t; @@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) do_each_thread(g, t) { if (!--max_count) goto unlock; + if (!--batch_count) { + batch_count = HUNG_TASK_BATCHING; + rcu_lock_break(g, t); + /* Exit if t or g was unhashed during refresh. */ + if (t->state == TASK_DEAD || g->state == TASK_DEAD) + goto unlock; + } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, now, timeout); -- cgit v1.2.3-59-g8ed1b From 94be52dc075a32af4aa73d7e10f68734d62d6af2 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 5 Feb 2009 09:56:08 -0800 Subject: softlockup: convert read_lock in hung_task to rcu_read_lock Since the tasklist is protected by rcu list operations, it is safe to convert the read_lock()s to rcu_read_lock(). Suggested-by: Peter Zijlstra Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 481ca8b5c2bc..3951a80e7cbe 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -155,7 +155,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g, t) { if (!--max_count) goto unlock; @@ -171,7 +171,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) check_hung_task(t, now, timeout); } while_each_thread(g, t); unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); } static void update_poll_jiffies(void) -- cgit v1.2.3-59-g8ed1b From 0a9877514c4fed10a70720293b37213dd172ee3e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 16:12:56 -0200 Subject: ring_buffer: remove unused flags parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: API change, cleanup >From ring_buffer_{lock_reserve,unlock_commit}. $ codiff /tmp/vmlinux.before /tmp/vmlinux.after linux-2.6-tip/kernel/trace/trace.c: trace_vprintk | -14 trace_graph_return | -14 trace_graph_entry | -10 trace_function | -8 __ftrace_trace_stack | -8 ftrace_trace_userstack | -8 tracing_sched_switch_trace | -8 ftrace_trace_special | -12 tracing_sched_wakeup_trace | -8 9 functions changed, 90 bytes removed, diff: -90 linux-2.6-tip/block/blktrace.c: __blk_add_trace | -1 1 function changed, 1 bytes removed, diff: -1 /tmp/vmlinux.after: 10 functions changed, 91 bytes removed, diff: -91 Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 8 +++--- include/linux/ring_buffer.h | 9 +++---- kernel/trace/kmemtrace.c | 12 +++------ kernel/trace/ring_buffer.c | 9 ++----- kernel/trace/trace.c | 56 ++++++++++++++-------------------------- kernel/trace/trace_boot.c | 12 +++------ kernel/trace/trace_branch.c | 7 +++-- kernel/trace/trace_hw_branches.c | 6 ++--- kernel/trace/trace_mmiotrace.c | 12 +++------ kernel/trace/trace_power.c | 12 +++------ 10 files changed, 51 insertions(+), 92 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index d9d7146ee023..8e52f24cc8f9 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -165,7 +165,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct blk_io_trace *t; - unsigned long flags; + unsigned long flags = 0; unsigned long *sequence; pid_t pid; int cpu, pc = 0; @@ -191,7 +191,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, tracing_record_cmdline(current); event = ring_buffer_lock_reserve(blk_tr->buffer, - sizeof(*t) + pdu_len, &flags); + sizeof(*t) + pdu_len); if (!event) return; @@ -241,11 +241,11 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tr) { - ring_buffer_unlock_commit(blk_tr->buffer, event, flags); + ring_buffer_unlock_commit(blk_tr->buffer, event); if (pid != 0 && !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && (trace_flags & TRACE_ITER_STACKTRACE) != 0) - __trace_stack(blk_tr, flags, 5, pc); + __trace_stack(blk_tr, 0, 5, pc); trace_wake_up(); return; } diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b3b359660082..3110d92e7d81 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -74,13 +74,10 @@ void ring_buffer_free(struct ring_buffer *buffer); int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); -struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, - unsigned long length, - unsigned long *flags); +struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer, + unsigned long length); int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags); + struct ring_buffer_event *event); int ring_buffer_write(struct ring_buffer *buffer, unsigned long length, void *data); diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index f04c0625f1cd..256749d1032a 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -272,13 +272,11 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, struct ring_buffer_event *event; struct kmemtrace_alloc_entry *entry; struct trace_array *tr = kmemtrace_array; - unsigned long irq_flags; if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -292,7 +290,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -305,13 +303,11 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, struct ring_buffer_event *event; struct kmemtrace_free_entry *entry; struct trace_array *tr = kmemtrace_array; - unsigned long irq_flags; if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -322,7 +318,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374ceef..aee76b3eeed2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1257,7 +1257,6 @@ static DEFINE_PER_CPU(int, rb_need_resched); * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) - * @flags: a pointer to save the interrupt flags * * Returns a reseverd event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into @@ -1270,9 +1269,7 @@ static DEFINE_PER_CPU(int, rb_need_resched); * If NULL is returned, then nothing has been allocated or locked. */ struct ring_buffer_event * -ring_buffer_lock_reserve(struct ring_buffer *buffer, - unsigned long length, - unsigned long *flags) +ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; @@ -1339,15 +1336,13 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to * @event: The event pointer to commit. - * @flags: the interrupt flags received from ring_buffer_lock_reserve. * * This commits the data to the ring buffer, and releases any locks held. * * Must be paired with ring_buffer_lock_reserve. */ int ring_buffer_unlock_commit(struct ring_buffer *buffer, - struct ring_buffer_event *event, - unsigned long flags) + struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; int cpu = raw_smp_processor_id(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3536ef41575d..eb453a238a6f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -783,14 +783,12 @@ trace_function(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_entry *entry; - unsigned long irq_flags; /* If we are reading the ring buffer, don't trace */ if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -798,7 +796,7 @@ trace_function(struct trace_array *tr, entry->ent.type = TRACE_FN; entry->ip = ip; entry->parent_ip = parent_ip; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -809,20 +807,18 @@ static void __trace_graph_entry(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_graph_ent_entry *entry; - unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_GRAPH_ENT; entry->graph_ent = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); + ring_buffer_unlock_commit(global_trace.buffer, event); } static void __trace_graph_return(struct trace_array *tr, @@ -832,20 +828,18 @@ static void __trace_graph_return(struct trace_array *tr, { struct ring_buffer_event *event; struct ftrace_graph_ret_entry *entry; - unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, flags, pc); entry->ent.type = TRACE_GRAPH_RET; entry->ret = *trace; - ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); + ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -866,10 +860,8 @@ static void __ftrace_trace_stack(struct trace_array *tr, struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -884,7 +876,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -912,13 +904,11 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; - unsigned long irq_flags; if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -933,7 +923,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -950,10 +940,8 @@ ftrace_trace_special(void *__tr, struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -962,9 +950,9 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); - ftrace_trace_stack(tr, irq_flags, 4, pc); - ftrace_trace_userstack(tr, irq_flags, pc); + ring_buffer_unlock_commit(tr->buffer, event); + ftrace_trace_stack(tr, 0, 4, pc); + ftrace_trace_userstack(tr, 0, pc); trace_wake_up(); } @@ -984,10 +972,8 @@ tracing_sched_switch_trace(struct trace_array *tr, { struct ring_buffer_event *event; struct ctx_switch_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -1000,7 +986,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 5, pc); ftrace_trace_userstack(tr, flags, pc); } @@ -1013,10 +999,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, { struct ring_buffer_event *event; struct ctx_switch_entry *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) return; entry = ring_buffer_event_data(event); @@ -1029,7 +1013,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_prio = wakee->prio; entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); @@ -2841,7 +2825,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) trace_buf[len] = 0; size = sizeof(*entry) + len + 1; - event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, size); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -2852,7 +2836,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: spin_unlock_irqrestore(&trace_buf_lock, irq_flags); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 1f07895977a0..4e08debf662d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -132,7 +132,6 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { struct ring_buffer_event *event; struct trace_boot_call *entry; - unsigned long irq_flags; struct trace_array *tr = boot_trace; if (!tr || !pre_initcalls_finished) @@ -144,15 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_BOOT_CALL; entry->boot_call = *bt; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -164,7 +162,6 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { struct ring_buffer_event *event; struct trace_boot_ret *entry; - unsigned long irq_flags; struct trace_array *tr = boot_trace; if (!tr || !pre_initcalls_finished) @@ -173,15 +170,14 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_BOOT_RET; entry->boot_ret = *bt; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 027e83690615..770e52acfc10 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -33,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; - unsigned long flags, irq_flags; + unsigned long flags; int cpu, pc; const char *p; @@ -52,8 +52,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; @@ -75,7 +74,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index fff3545fc866..e720c001db2b 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -175,7 +175,7 @@ void trace_hw_branch(u64 from, u64 to) struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; - unsigned long irq1, irq2; + unsigned long irq1; int cpu; if (unlikely(!tr)) @@ -189,7 +189,7 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); @@ -198,7 +198,7 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.cpu = cpu; entry->from = from; entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event, irq2); + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index ec78e244242e..104ddebc11d1 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,10 +307,8 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) { atomic_inc(&dropped_count); return; @@ -319,7 +317,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_RW; entry->rw = *rw; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -337,10 +335,8 @@ static void __trace_mmiotrace_map(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; - unsigned long irq_flags; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) { atomic_inc(&dropped_count); return; @@ -349,7 +345,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, tracing_generic_entry_update(&entry->ent, 0, preempt_count()); entry->ent.type = TRACE_MMIO_MAP; entry->map = *map; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index faa6ab7a1f5c..3b1a292d12d2 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -115,7 +115,6 @@ void trace_power_end(struct power_trace *it) struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; - unsigned long irq_flags; struct trace_array *tr = power_trace; if (!trace_power_enabled) @@ -125,15 +124,14 @@ void trace_power_end(struct power_trace *it) it->end = ktime_get(); data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -148,7 +146,6 @@ void trace_power_mark(struct power_trace *it, unsigned int type, struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; - unsigned long irq_flags; struct trace_array *tr = power_trace; if (!trace_power_enabled) @@ -162,15 +159,14 @@ void trace_power_mark(struct power_trace *it, unsigned int type, it->end = it->stamp; data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), - &irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); if (!event) goto out; entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, 0); entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); -- cgit v1.2.3-59-g8ed1b From 51a763dd84253bab1d0a1e68e11a7753d1b702ca Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 16:14:13 -0200 Subject: tracing: Introduce trace_buffer_{lock_reserve,unlock_commit} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: new API These new functions do what previously was being open coded, reducing the number of details ftrace plugin writers have to worry about. It also standardizes the handling of stacktrace, userstacktrace and other trace options we may introduce in the future. With this patch, for instance, the blk tracer (and some others already in the tree) can use the "userstacktrace" /d/tracing/trace_options facility. $ codiff /tmp/vmlinux.before /tmp/vmlinux.after linux-2.6-tip/kernel/trace/trace.c: trace_vprintk | -5 trace_graph_return | -22 trace_graph_entry | -26 trace_function | -45 __ftrace_trace_stack | -27 ftrace_trace_userstack | -29 tracing_sched_switch_trace | -66 tracing_stop | +1 trace_seq_to_user | -1 ftrace_trace_special | -63 ftrace_special | +1 tracing_sched_wakeup_trace | -70 tracing_reset_online_cpus | -1 13 functions changed, 2 bytes added, 355 bytes removed, diff: -353 linux-2.6-tip/block/blktrace.c: __blk_add_trace | -58 1 function changed, 58 bytes removed, diff: -58 linux-2.6-tip/kernel/trace/trace.c: trace_buffer_lock_reserve | +88 trace_buffer_unlock_commit | +86 2 functions changed, 174 bytes added, diff: +174 /tmp/vmlinux.after: 16 functions changed, 176 bytes added, 413 bytes removed, diff: -237 Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 21 +++------ kernel/trace/kmemtrace.c | 19 +++----- kernel/trace/trace.c | 94 ++++++++++++++++++++++------------------ kernel/trace/trace.h | 11 +++++ kernel/trace/trace_boot.c | 20 +++------ kernel/trace/trace_branch.c | 7 ++- kernel/trace/trace_hw_branches.c | 7 ++- kernel/trace/trace_mmiotrace.c | 20 ++++----- kernel/trace/trace_power.c | 20 +++------ 9 files changed, 102 insertions(+), 117 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 8e52f24cc8f9..834cd84037b2 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -187,19 +187,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, cpu = raw_smp_processor_id(); if (blk_tr) { - struct trace_entry *ent; tracing_record_cmdline(current); - event = ring_buffer_lock_reserve(blk_tr->buffer, - sizeof(*t) + pdu_len); + pc = preempt_count(); + event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + sizeof(*t) + pdu_len, + 0, pc); if (!event) return; - - ent = ring_buffer_event_data(event); - t = (struct blk_io_trace *)ent; - pc = preempt_count(); - tracing_generic_entry_update(ent, 0, pc); - ent->type = TRACE_BLK; + t = ring_buffer_event_data(event); goto record_it; } @@ -241,12 +237,7 @@ record_it: memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); if (blk_tr) { - ring_buffer_unlock_commit(blk_tr->buffer, event); - if (pid != 0 && - !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) && - (trace_flags & TRACE_ITER_STACKTRACE) != 0) - __trace_stack(blk_tr, 0, 5, pc); - trace_wake_up(); + trace_buffer_unlock_commit(blk_tr, event, 0, pc); return; } } diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 256749d1032a..ae201b3eda89 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -276,13 +276,12 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC, + sizeof(*entry), 0, 0); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_KMEM_ALLOC; entry->call_site = call_site; entry->ptr = ptr; entry->bytes_req = bytes_req; @@ -290,9 +289,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, 0, 0); } EXPORT_SYMBOL(kmemtrace_mark_alloc_node); @@ -307,20 +304,16 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id, if (!kmem_tracing_enabled) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE, + sizeof(*entry), 0, 0); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_FREE; entry->type_id = type_id; entry->call_site = call_site; entry->ptr = ptr; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, 0, 0); } EXPORT_SYMBOL(kmemtrace_mark_free); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eb453a238a6f..8fad3776e843 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -776,6 +776,39 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); } +struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, + unsigned char type, + unsigned long len, + unsigned long flags, int pc) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(tr->buffer, len); + if (event != NULL) { + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; + } + + return event; +} +static void ftrace_trace_stack(struct trace_array *tr, + unsigned long flags, int skip, int pc); +static void ftrace_trace_userstack(struct trace_array *tr, + unsigned long flags, int pc); + +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + ring_buffer_unlock_commit(tr->buffer, event); + + ftrace_trace_stack(tr, flags, 6, pc); + ftrace_trace_userstack(tr, flags, pc); + trace_wake_up(); +} + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -788,12 +821,11 @@ trace_function(struct trace_array *tr, if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), + flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_FN; entry->ip = ip; entry->parent_ip = parent_ip; ring_buffer_unlock_commit(tr->buffer, event); @@ -811,12 +843,11 @@ static void __trace_graph_entry(struct trace_array *tr, if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_GRAPH_ENT; entry->graph_ent = *trace; ring_buffer_unlock_commit(global_trace.buffer, event); } @@ -832,12 +863,11 @@ static void __trace_graph_return(struct trace_array *tr, if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) return; - event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_GRAPH_RET; entry->ret = *trace; ring_buffer_unlock_commit(global_trace.buffer, event); } @@ -861,13 +891,11 @@ static void __ftrace_trace_stack(struct trace_array *tr, struct stack_entry *entry; struct stack_trace trace; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_STACK, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_STACK; - memset(&entry->caller, 0, sizeof(entry->caller)); trace.nr_entries = 0; @@ -908,12 +936,11 @@ static void ftrace_trace_userstack(struct trace_array *tr, if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_USER_STACK; memset(&entry->caller, 0, sizeof(entry->caller)); @@ -941,20 +968,15 @@ ftrace_trace_special(void *__tr, struct trace_array *tr = __tr; struct special_entry *entry; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, + sizeof(*entry), 0, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, pc); - entry->ent.type = TRACE_SPECIAL; entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, 0, 4, pc); - ftrace_trace_userstack(tr, 0, pc); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, 0, pc); } void @@ -973,12 +995,11 @@ tracing_sched_switch_trace(struct trace_array *tr, struct ring_buffer_event *event; struct ctx_switch_entry *entry; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_CTX, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_CTX; entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; entry->prev_state = prev->state; @@ -986,9 +1007,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, flags, 5, pc); - ftrace_trace_userstack(tr, flags, pc); + trace_buffer_unlock_commit(tr, event, flags, pc); } void @@ -1000,12 +1019,11 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct ring_buffer_event *event; struct ctx_switch_entry *entry; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_WAKE, + sizeof(*entry), flags, pc); if (!event) return; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_WAKE; entry->prev_pid = curr->pid; entry->prev_prio = curr->prio; entry->prev_state = curr->state; @@ -1013,11 +1031,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_prio = wakee->prio; entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - ring_buffer_unlock_commit(tr->buffer, event); - ftrace_trace_stack(tr, flags, 6, pc); - ftrace_trace_userstack(tr, flags, pc); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, flags, pc); } void @@ -2825,12 +2839,10 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) trace_buf[len] = 0; size = sizeof(*entry) + len + 1; - event = ring_buffer_lock_reserve(tr->buffer, size); + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, irq_flags, pc); - entry->ent.type = TRACE_PRINT; entry->ip = ip; entry->depth = depth; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index df627a948694..e03f157c772e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -403,6 +403,17 @@ int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); +struct ring_buffer_event; + +struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, + unsigned char type, + unsigned long len, + unsigned long flags, + int pc); +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc); + struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 4e08debf662d..7a30fc4c3642 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -143,17 +143,13 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, + sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_BOOT_CALL; entry->boot_call = *bt; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); - + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } @@ -170,17 +166,13 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) sprint_symbol(bt->func, (unsigned long)fn); preempt_disable(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, + sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_BOOT_RET; entry->boot_ret = *bt; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); - + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 770e52acfc10..48b2196abe37 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -52,14 +52,13 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + pc = preempt_count(); + event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, + sizeof(*entry), flags, pc); if (!event) goto out; - pc = preempt_count(); entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, flags, pc); - entry->ent.type = TRACE_BRANCH; /* Strip off the path, only save the file */ p = f->file + strlen(f->file); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e720c001db2b..2aa1c9f4c7d8 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -189,16 +189,15 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, + sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, from); - entry->ent.type = TRACE_HW_BRANCHES; entry->ent.cpu = cpu; entry->from = from; entry->to = to; - ring_buffer_unlock_commit(tr->buffer, event); + trace_buffer_unlock_commit(tr, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 104ddebc11d1..c401b908e805 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -307,19 +307,17 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; + int pc = preempt_count(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, + sizeof(*entry), 0, pc); if (!event) { atomic_inc(&dropped_count); return; } entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, preempt_count()); - entry->ent.type = TRACE_MMIO_RW; entry->rw = *rw; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, 0, pc); } void mmio_trace_rw(struct mmiotrace_rw *rw) @@ -335,19 +333,17 @@ static void __trace_mmiotrace_map(struct trace_array *tr, { struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; + int pc = preempt_count(); - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, + sizeof(*entry), 0, pc); if (!event) { atomic_inc(&dropped_count); return; } entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, preempt_count()); - entry->ent.type = TRACE_MMIO_MAP; entry->map = *map; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); + trace_buffer_unlock_commit(tr, event, 0, pc); } void mmio_trace_mapping(struct mmiotrace_map *map) diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 3b1a292d12d2..bfc21f8079ab 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -124,17 +124,13 @@ void trace_power_end(struct power_trace *it) it->end = ktime_get(); data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_POWER, + sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); - + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } @@ -159,17 +155,13 @@ void trace_power_mark(struct power_trace *it, unsigned int type, it->end = it->stamp; data = tr->data[smp_processor_id()]; - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + event = trace_buffer_lock_reserve(tr, TRACE_POWER, + sizeof(*entry), 0, 0); if (!event) goto out; entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - entry->ent.type = TRACE_POWER; entry->state_data = *it; - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); - + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } -- cgit v1.2.3-59-g8ed1b From b6f11df26fdc28324cf9c9e3b77f2dc985c1bb13 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 5 Feb 2009 18:02:00 -0200 Subject: trace: Call tracing_reset_online_cpus before tracer->init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup To make it easy for ftrace plugin writers, as this was open coded in the existing plugins Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Frédéric Weisbecker Signed-off-by: Ingo Molnar --- block/blktrace.c | 2 -- kernel/trace/trace.c | 8 +++++++- kernel/trace/trace.h | 1 + kernel/trace/trace_branch.c | 1 - kernel/trace/trace_functions.c | 17 +++-------------- kernel/trace/trace_functions_graph.c | 1 - kernel/trace/trace_hw_branches.c | 1 - kernel/trace/trace_nop.c | 1 - kernel/trace/trace_sched_switch.c | 8 +------- kernel/trace/trace_selftest.c | 18 +++++++++--------- kernel/trace/trace_sysprof.c | 14 ++++---------- 11 files changed, 25 insertions(+), 47 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index 834cd84037b2..ca6d32061e4f 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -1086,8 +1086,6 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { - tracing_reset_online_cpus(tr); - mutex_lock(&blk_probe_mutex); if (atomic_add_return(1, &blk_probes_ref) == 1) if (blk_register_tracepoints()) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8fad3776e843..ef4dbac95568 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2171,6 +2171,12 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +int tracer_init(struct tracer *t, struct trace_array *tr) +{ + tracing_reset_online_cpus(tr); + return t->init(tr); +} + static int tracing_set_tracer(const char *buf) { struct trace_array *tr = &global_trace; @@ -2195,7 +2201,7 @@ static int tracing_set_tracer(const char *buf) current_trace = t; if (t->init) { - ret = t->init(tr); + ret = tracer_init(t, tr); if (ret) goto out; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e03f157c772e..f2742fb1575a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -395,6 +395,7 @@ struct trace_iterator { cpumask_var_t started; }; +int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 48b2196abe37..f8ae2c50e01d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -131,7 +131,6 @@ static void stop_branch_trace(struct trace_array *tr) static int branch_trace_init(struct trace_array *tr) { - tracing_reset_online_cpus(tr); start_branch_trace(tr); return 0; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d067cea2ccc3..36bf9568ccd9 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -24,32 +24,21 @@ static struct trace_array *func_trace; static void tracing_start_function_trace(void); static void tracing_stop_function_trace(void); -static void start_function_trace(struct trace_array *tr) +static int function_trace_init(struct trace_array *tr) { func_trace = tr; tr->cpu = get_cpu(); - tracing_reset_online_cpus(tr); put_cpu(); tracing_start_cmdline_record(); tracing_start_function_trace(); -} - -static void stop_function_trace(struct trace_array *tr) -{ - tracing_stop_function_trace(); - tracing_stop_cmdline_record(); -} - -static int function_trace_init(struct trace_array *tr) -{ - start_function_trace(tr); return 0; } static void function_trace_reset(struct trace_array *tr) { - stop_function_trace(tr); + tracing_stop_function_trace(); + tracing_stop_cmdline_record(); } static void function_trace_start(struct trace_array *tr) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c97594d826bc..222f97d336a6 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -56,7 +56,6 @@ static int graph_trace_init(struct trace_array *tr) &trace_graph_entry); if (ret) return ret; - tracing_reset_online_cpus(tr); tracing_start_cmdline_record(); return 0; diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 2aa1c9f4c7d8..ca4bbcfb9e2c 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -132,7 +132,6 @@ static int bts_trace_init(struct trace_array *tr) hw_branch_trace = tr; register_hotcpu_notifier(&bts_hotcpu_notifier); - tracing_reset_online_cpus(tr); bts_trace_start(tr); return 0; diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 087b6cbf4ea5..9aa84bde23cd 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -48,7 +48,6 @@ static void stop_nop_trace(struct trace_array *tr) static int nop_trace_init(struct trace_array *tr) { ctx_trace = tr; - tracing_reset_online_cpus(tr); start_nop_trace(tr); return 0; } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index c4f9add5ec90..30e14fe85896 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -185,12 +185,6 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) ctx_trace = tr; } -static void start_sched_trace(struct trace_array *tr) -{ - tracing_reset_online_cpus(tr); - tracing_start_sched_switch_record(); -} - static void stop_sched_trace(struct trace_array *tr) { tracing_stop_sched_switch_record(); @@ -199,7 +193,7 @@ static void stop_sched_trace(struct trace_array *tr) static int sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; - start_sched_trace(tr); + tracing_start_sched_switch_record(); return 0; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5013812578b1..445700e51f6d 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -115,7 +115,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_set_filter(func_name, strlen(func_name), 1); /* enable tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); goto out; @@ -189,7 +189,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 1; tracer_enabled = 1; - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); goto out; @@ -236,7 +236,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return ret; @@ -290,7 +290,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) } /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return ret; @@ -344,7 +344,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); goto out; @@ -476,7 +476,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wait_for_completion(&isrt); /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return ret; @@ -537,7 +537,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr int ret; /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return ret; @@ -569,7 +569,7 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return 0; @@ -596,7 +596,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) int ret; /* start the tracing */ - ret = trace->init(tr); + ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); return ret; diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index eaca5ad803ff..84ca9d81e74d 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -226,15 +226,6 @@ static void stop_stack_timers(void) stop_stack_timer(cpu); } -static void start_stack_trace(struct trace_array *tr) -{ - mutex_lock(&sample_timer_lock); - tracing_reset_online_cpus(tr); - start_stack_timers(); - tracer_enabled = 1; - mutex_unlock(&sample_timer_lock); -} - static void stop_stack_trace(struct trace_array *tr) { mutex_lock(&sample_timer_lock); @@ -247,7 +238,10 @@ static int stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; - start_stack_trace(tr); + mutex_lock(&sample_timer_lock); + start_stack_timers(); + tracer_enabled = 1; + mutex_unlock(&sample_timer_lock); return 0; } -- cgit v1.2.3-59-g8ed1b From 304cc6ae1bf7a8e6d00053fbe0b7e2b26cdddda2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Feb 2009 01:12:02 +0100 Subject: ring_buffer: remove unused flags parameter, fix Oprofile's ring-buffer use was not considered. Signed-off-by: Ingo Molnar --- drivers/oprofile/cpu_buffer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index e76d715e4342..f0e99d4c066b 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -161,7 +161,7 @@ struct op_sample { entry->event = ring_buffer_lock_reserve (op_ring_buffer_write, sizeof(struct op_sample) + - size * sizeof(entry->sample->data[0]), &entry->irq_flags); + size * sizeof(entry->sample->data[0])); if (entry->event) entry->sample = ring_buffer_event_data(entry->event); else @@ -178,8 +178,7 @@ struct op_sample int op_cpu_buffer_write_commit(struct op_entry *entry) { - return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event, - entry->irq_flags); + return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event); } struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) -- cgit v1.2.3-59-g8ed1b From 1830b52d0de8c60c4f5dfbac134aa8f69d815801 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 7 Feb 2009 19:38:43 -0500 Subject: trace: remove deprecated entry->cpu Impact: fix to prevent developers from using entry->cpu With the new ring buffer infrastructure, the cpu for the entry is implicit with which CPU buffer it is on. The original code use to record the current cpu into the generic entry header, which can be retrieved by entry->cpu. When the ring buffer was introduced, the users were convert to use the the cpu number of which cpu ring buffer was in use (this was passed to the tracers by the iterator: iter->cpu). Unfortunately, the cpu item in the entry structure was never removed. This allowed for developers to use it instead of the proper iter->cpu, unknowingly, using an uninitialized variable. This was not the fault of the developers, since it would seem like the logical place to retrieve the cpu identifier. This patch removes the cpu item from the entry structure and fixes all the users that should have been using iter->cpu. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 1 - kernel/trace/trace_hw_branches.c | 3 +-- kernel/trace/trace_output.c | 6 +++--- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd51cf0b94c7..bd4d9f8818fa 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1531,7 +1531,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) if (trace_flags & TRACE_ITER_CONTEXT_INFO) { SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); + SEQ_PUT_FIELD_RET(s, iter->cpu); SEQ_PUT_FIELD_RET(s, iter->ts); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f0c7a0f08cac..5efc4c707f7e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -45,7 +45,6 @@ enum trace_type { */ struct trace_entry { unsigned char type; - unsigned char cpu; unsigned char flags; unsigned char preempt_count; int pid; diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index fff3545fc866..549238a9b133 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -159,7 +159,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) trace_assign_type(it, entry); if (entry->type == TRACE_HW_BRANCHES) { - if (trace_seq_printf(seq, "%4d ", entry->cpu) && + if (trace_seq_printf(seq, "%4d ", iter->cpu) && seq_print_ip_sym(seq, it->to, symflags) && trace_seq_printf(seq, "\t <- ") && seq_print_ip_sym(seq, it->from, symflags) && @@ -195,7 +195,6 @@ void trace_hw_branch(u64 from, u64 to) entry = ring_buffer_event_data(event); tracing_generic_entry_update(&entry->ent, 0, from); entry->ent.type = TRACE_HW_BRANCHES; - entry->ent.cpu = cpu; entry->from = from; entry->to = to; ring_buffer_unlock_commit(tr->buffer, event, irq2); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b7380eee9fa1..463a310b1d3b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -333,7 +333,7 @@ int trace_print_context(struct trace_iterator *iter) unsigned long secs = (unsigned long)t; return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", - comm, entry->pid, entry->cpu, secs, usec_rem); + comm, entry->pid, iter->cpu, secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) @@ -356,7 +356,7 @@ int trace_print_lat_context(struct trace_iterator *iter) char *comm = trace_find_cmdline(entry->pid); ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, - entry->pid, entry->cpu, entry->flags, + entry->pid, iter->cpu, entry->flags, entry->preempt_count, iter->idx, ns2usecs(iter->ts), abs_usecs / USEC_PER_MSEC, @@ -364,7 +364,7 @@ int trace_print_lat_context(struct trace_iterator *iter) rel_usecs / USEC_PER_MSEC, rel_usecs % USEC_PER_MSEC); } else { - ret = lat_print_generic(s, entry, entry->cpu); + ret = lat_print_generic(s, entry, iter->cpu); if (ret) ret = lat_print_timestamp(s, abs_usecs, rel_usecs); } -- cgit v1.2.3-59-g8ed1b From 78d904b46a72fcf15ea6a39672bbef92953876b5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 18:43:07 -0500 Subject: ring-buffer: add NMI protection for spinlocks Impact: prevent deadlock in NMI The ring buffers are not yet totally lockless with writing to the buffer. When a writer crosses a page, it grabs a per cpu spinlock to protect against a reader. The spinlocks taken by a writer are not to protect against other writers, since a writer can only write to its own per cpu buffer. The spinlocks protect against readers that can touch any cpu buffer. The writers are made to be reentrant with the spinlocks disabling interrupts. The problem arises when an NMI writes to the buffer, and that write crosses a page boundary. If it grabs a spinlock, it can be racing with another writer (since disabling interrupts does not protect against NMIs) or with a reader on the same CPU. Luckily, most of the users are not reentrant and protects against this issue. But if a user of the ring buffer becomes reentrant (which is what the ring buffers do allow), if the NMI also writes to the ring buffer then we risk the chance of a deadlock. This patch moves the ftrace_nmi_enter called by nmi_enter() to the ring buffer code. It replaces the current ftrace_nmi_enter that is used by arch specific code to arch_ftrace_nmi_enter and updates the Kconfig to handle it. When an NMI is called, it will set a per cpu variable in the ring buffer code and will clear it when the NMI exits. If a write to the ring buffer crosses page boundaries inside an NMI, a trylock is used on the spin lock instead. If the spinlock fails to be acquired, then the entry is discarded. This bug appeared in the ftrace work in the RT tree, where event tracing is reentrant. This workaround solved the deadlocks that appeared there. Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 1 + arch/x86/kernel/ftrace.c | 8 ++++---- include/linux/ftrace_irq.h | 10 +++++++++- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/ring_buffer.c | 48 ++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 68 insertions(+), 7 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73f7fe8fd4d1..a6be725cb049 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST + select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4d33224c055f..4c683587055b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); /* Must have in_nmi seen before reading write flag */ @@ -124,7 +124,7 @@ void ftrace_nmi_enter(void) } } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { /* Finish all executions before clearing in_nmi */ smp_wmb(); @@ -376,12 +376,12 @@ int ftrace_disable_ftrace_graph_caller(void) */ static atomic_t in_nmi; -void ftrace_nmi_enter(void) +void arch_ftrace_nmi_enter(void) { atomic_inc(&in_nmi); } -void ftrace_nmi_exit(void) +void arch_ftrace_nmi_exit(void) { atomic_dec(&in_nmi); } diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 366a054d0b05..29de6779a963 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,15 @@ #define _LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) +#ifdef CONFIG_FTRACE_NMI_ENTER +extern void arch_ftrace_nmi_enter(void); +extern void arch_ftrace_nmi_exit(void); +#else +static inline void arch_ftrace_nmi_enter(void) { } +static inline void arch_ftrace_nmi_exit(void) { } +#endif + +#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 28f2644484d9..25131a5d5e4f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_FTRACE_NMI_ENTER + bool + config HAVE_FUNCTION_TRACER bool @@ -37,6 +40,11 @@ config TRACER_MAX_TRACE config RING_BUFFER bool +config FTRACE_NMI_ENTER + bool + depends on HAVE_FTRACE_NMI_ENTER + default y + config TRACING bool select DEBUG_FS diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b36d7374ceef..a60a6a852f42 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -18,6 +19,35 @@ #include "trace.h" +/* + * Since the write to the buffer is still not fully lockless, + * we must be careful with NMIs. The locks in the writers + * are taken when a write crosses to a new page. The locks + * protect against races with the readers (this will soon + * be fixed with a lockless solution). + * + * Because we can not protect against NMIs, and we want to + * keep traces reentrant, we need to manage what happens + * when we are in an NMI. + */ +static DEFINE_PER_CPU(int, rb_in_nmi); + +void ftrace_nmi_enter(void) +{ + __get_cpu_var(rb_in_nmi)++; + /* call arch specific handler too */ + arch_ftrace_nmi_enter(); +} + +void ftrace_nmi_exit(void) +{ + arch_ftrace_nmi_exit(); + __get_cpu_var(rb_in_nmi)--; + /* NMIs are not recursive */ + WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); +} + + /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; unsigned long flags; + bool lock_taken = false; commit_page = cpu_buffer->commit_page; /* we just need to protect against interrupts */ @@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page = tail_page; local_irq_save(flags); - __raw_spin_lock(&cpu_buffer->lock); + /* + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) + goto out_unlock; + } else + __raw_spin_lock(&cpu_buffer->lock); + + lock_taken = true; rb_inc_page(cpu_buffer, &next_page); @@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail <= BUF_PAGE_SIZE) local_set(&tail_page->write, tail); - __raw_spin_unlock(&cpu_buffer->lock); + if (likely(lock_taken)) + __raw_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); return NULL; } -- cgit v1.2.3-59-g8ed1b From d8b891a2db13c8ed296158d6f8c4e335896d0cef Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 19:54:51 -0500 Subject: ring-buffer: allow tracing_off to be used in core kernel code tracing_off() is the fastest way to stop recording to the ring buffers. This may be used in places like panic and die, just before the ftrace_dump is called. This patch adds the appropriate CPP conditionals to make it a stub function when the ring buffer is not configured it. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b3b359660082..ac94c066f6e9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -124,9 +124,18 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +/* + * The below functions are fine to use outside the tracing facility. + */ +#ifdef CONFIG_RING_BUFFER void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +#else +static inline void tracing_on(void) { } +static inline void tracing_off(void) { } +static inline void tracing_off_permanent(void) { } +#endif void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); -- cgit v1.2.3-59-g8ed1b From 4e6ea1440c67de32d7c89aacf233472dfc3bce82 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Feb 2009 22:30:07 -0500 Subject: ftrace, x86: rename in_nmi variable Impact: clean up The in_nmi variable in x86 arch ftrace.c is a misnomer. Andrew Morton pointed out that the in_nmi variable is incremented by all CPUS. It can be set when another CPU is running an NMI. Since this is actually intentional, the fix is to rename it to what it really is: "nmi_running" Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4c683587055b..e3fad2ef622c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -82,7 +82,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * are the same as what exists. */ -static atomic_t in_nmi = ATOMIC_INIT(0); +static atomic_t nmi_running = ATOMIC_INIT(0); static int mod_code_status; /* holds return value of text write */ static int mod_code_write; /* set when NMI should do the write */ static void *mod_code_ip; /* holds the IP to write to */ @@ -115,8 +115,8 @@ static void ftrace_mod_code(void) void arch_ftrace_nmi_enter(void) { - atomic_inc(&in_nmi); - /* Must have in_nmi seen before reading write flag */ + atomic_inc(&nmi_running); + /* Must have nmi_running seen before reading write flag */ smp_mb(); if (mod_code_write) { ftrace_mod_code(); @@ -126,19 +126,19 @@ void arch_ftrace_nmi_enter(void) void arch_ftrace_nmi_exit(void) { - /* Finish all executions before clearing in_nmi */ + /* Finish all executions before clearing nmi_running */ smp_wmb(); - atomic_dec(&in_nmi); + atomic_dec(&nmi_running); } static void wait_for_nmi(void) { - if (!atomic_read(&in_nmi)) + if (!atomic_read(&nmi_running)) return; do { cpu_relax(); - } while(atomic_read(&in_nmi)); + } while (atomic_read(&nmi_running)); nmi_wait_count++; } @@ -374,16 +374,16 @@ int ftrace_disable_ftrace_graph_caller(void) * this page for dynamic ftrace. They have been * simplified to ignore all traces in NMI context. */ -static atomic_t in_nmi; +static atomic_t nmi_running; void arch_ftrace_nmi_enter(void) { - atomic_inc(&in_nmi); + atomic_inc(&nmi_running); } void arch_ftrace_nmi_exit(void) { - atomic_dec(&in_nmi); + atomic_dec(&nmi_running); } #endif /* !CONFIG_DYNAMIC_FTRACE */ @@ -475,7 +475,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) &return_to_handler; /* Nmi's are currently unsupported */ - if (unlikely(atomic_read(&in_nmi))) + if (unlikely(atomic_read(&nmi_running))) return; if (unlikely(atomic_read(¤t->tracing_graph_pause))) -- cgit v1.2.3-59-g8ed1b From 375b38b4214f29109a393ab762d468054bf52354 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 00:51:37 -0500 Subject: nmi: add generic nmi tracking state This code adds an in_nmi() macro that uses the current tasks preempt count to track when it is in NMI context. Other parts of the kernel can use this to determine if the context is in NMI context or not. This code was inspired by the -rt patch in_nmi version that was written by Peter Zijlstra, who borrowed that code from Mathieu Desnoyers. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- include/linux/hardirq.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f83288347dda..f3cf86e1465b 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -61,6 +61,12 @@ #error PREEMPT_ACTIVE is too low! #endif +#define NMI_OFFSET (PREEMPT_ACTIVE << 1) + +#if NMI_OFFSET >= 0x80000000 +#error PREEMPT_ACTIVE too high! +#endif + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) @@ -73,6 +79,11 @@ #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) +/* + * Are we in NMI context? + */ +#define in_nmi() (preempt_count() & NMI_OFFSET) + #if defined(CONFIG_PREEMPT) # define PREEMPT_INATOMIC_BASE kernel_locked() # define PREEMPT_CHECK_OFFSET 1 @@ -167,6 +178,8 @@ extern void irq_exit(void); #define nmi_enter() \ do { \ ftrace_nmi_enter(); \ + BUG_ON(in_nmi()); \ + add_preempt_count(NMI_OFFSET); \ lockdep_off(); \ rcu_nmi_enter(); \ __irq_enter(); \ @@ -177,6 +190,8 @@ extern void irq_exit(void); __irq_exit(); \ rcu_nmi_exit(); \ lockdep_on(); \ + BUG_ON(!in_nmi()); \ + sub_preempt_count(NMI_OFFSET); \ ftrace_nmi_exit(); \ } while (0) -- cgit v1.2.3-59-g8ed1b From 9a5fd902273d01170fd033691bd70b142baa7309 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 01:14:26 -0500 Subject: ftrace: change function graph tracer to use new in_nmi The function graph tracer piggy backed onto the dynamic ftracer to use the in_nmi custom code for dynamic tracing. The problem was (as Andrew Morton pointed out) it really only wanted to bail out if the context of the current CPU was in NMI context. But the dynamic ftrace in_nmi custom code was true if _any_ CPU happened to be in NMI context. Now that we have a generic in_nmi interface, this patch changes the function graph code to use it instead of the dynamic ftarce custom code. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- arch/x86/Kconfig | 2 +- arch/x86/kernel/ftrace.c | 21 +-------------------- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a6be725cb049..2cf7bbcaed4e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,7 +34,7 @@ config X86 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST - select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER + select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index e3fad2ef622c..918073c6681b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -367,25 +367,6 @@ int ftrace_disable_ftrace_graph_caller(void) return ftrace_mod_jmp(ip, old_offset, new_offset); } -#else /* CONFIG_DYNAMIC_FTRACE */ - -/* - * These functions are picked from those used on - * this page for dynamic ftrace. They have been - * simplified to ignore all traces in NMI context. - */ -static atomic_t nmi_running; - -void arch_ftrace_nmi_enter(void) -{ - atomic_inc(&nmi_running); -} - -void arch_ftrace_nmi_exit(void) -{ - atomic_dec(&nmi_running); -} - #endif /* !CONFIG_DYNAMIC_FTRACE */ /* Add a function return address to the trace stack on thread info.*/ @@ -475,7 +456,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) &return_to_handler; /* Nmi's are currently unsupported */ - if (unlikely(atomic_read(&nmi_running))) + if (unlikely(in_nmi())) return; if (unlikely(atomic_read(¤t->tracing_graph_pause))) -- cgit v1.2.3-59-g8ed1b From a81bd80a0b0a405dc0483e2c428332d69da2c79f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Feb 2009 01:45:16 -0500 Subject: ring-buffer: use generic version of in_nmi Impact: clean up Now that a generic in_nmi is available, this patch removes the special code in the ring_buffer and implements the in_nmi generic version instead. With this change, I was also able to rename the "arch_ftrace_nmi_enter" back to "ftrace_nmi_enter" and remove the code from the ring buffer. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ++-- include/linux/ftrace_irq.h | 8 -------- kernel/trace/ring_buffer.c | 43 +++++++++++++------------------------------ 3 files changed, 15 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 918073c6681b..d74d75e0952d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -113,7 +113,7 @@ static void ftrace_mod_code(void) MCOUNT_INSN_SIZE); } -void arch_ftrace_nmi_enter(void) +void ftrace_nmi_enter(void) { atomic_inc(&nmi_running); /* Must have nmi_running seen before reading write flag */ @@ -124,7 +124,7 @@ void arch_ftrace_nmi_enter(void) } } -void arch_ftrace_nmi_exit(void) +void ftrace_nmi_exit(void) { /* Finish all executions before clearing nmi_running */ smp_wmb(); diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 29de6779a963..dca7bf8cffe2 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -3,14 +3,6 @@ #ifdef CONFIG_FTRACE_NMI_ENTER -extern void arch_ftrace_nmi_enter(void); -extern void arch_ftrace_nmi_exit(void); -#else -static inline void arch_ftrace_nmi_enter(void) { } -static inline void arch_ftrace_nmi_exit(void) { } -#endif - -#ifdef CONFIG_RING_BUFFER extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a60a6a852f42..5ee344417cd5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -19,35 +20,6 @@ #include "trace.h" -/* - * Since the write to the buffer is still not fully lockless, - * we must be careful with NMIs. The locks in the writers - * are taken when a write crosses to a new page. The locks - * protect against races with the readers (this will soon - * be fixed with a lockless solution). - * - * Because we can not protect against NMIs, and we want to - * keep traces reentrant, we need to manage what happens - * when we are in an NMI. - */ -static DEFINE_PER_CPU(int, rb_in_nmi); - -void ftrace_nmi_enter(void) -{ - __get_cpu_var(rb_in_nmi)++; - /* call arch specific handler too */ - arch_ftrace_nmi_enter(); -} - -void ftrace_nmi_exit(void) -{ - arch_ftrace_nmi_exit(); - __get_cpu_var(rb_in_nmi)--; - /* NMIs are not recursive */ - WARN_ON_ONCE(__get_cpu_var(rb_in_nmi)); -} - - /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers @@ -1027,12 +999,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, local_irq_save(flags); /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * * NMIs can happen after we take the lock. * If we are in an NMI, only take the lock * if it is not already taken. Otherwise * simply fail. */ - if (unlikely(__get_cpu_var(rb_in_nmi))) { + if (unlikely(in_nmi())) { if (!__raw_spin_trylock(&cpu_buffer->lock)) goto out_unlock; } else -- cgit v1.2.3-59-g8ed1b From 57794a9d48b63e34acbe63282628c9f029603308 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Fri, 6 Feb 2009 17:33:27 +0800 Subject: trace: trivial fixes in comment typos. Impact: clean up Fixed several typos in the comments. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 6 +++--- kernel/trace/trace.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7840e718c6c7..5e302d636fc2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -140,7 +140,7 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; } #endif /** - * ftrace_make_nop - convert code into top + * ftrace_make_nop - convert code into nop * @mod: module structure if called by module load initialization * @rec: the mcount call site record * @addr: the address that the call site should be calling diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 68610031780b..1796e018fbff 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,7 +465,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) * it is not enabled then do nothing. * * If this record is not to be traced and - * it is enabled then disabled it. + * it is enabled then disable it. * */ if (rec->flags & FTRACE_FL_NOTRACE) { @@ -485,7 +485,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) return 0; - /* Record is not filtered and is not enabled do nothing */ + /* Record is not filtered or enabled, do nothing */ if (!fl) return 0; @@ -507,7 +507,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) } else { - /* if record is not enabled do nothing */ + /* if record is not enabled, do nothing */ if (!(rec->flags & FTRACE_FL_ENABLED)) return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5efc4c707f7e..f92aba52a894 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -616,12 +616,12 @@ extern struct tracer nop_trace; * preempt_enable (after a disable), a schedule might take place * causing an infinite recursion. * - * To prevent this, we read the need_recshed flag before + * To prevent this, we read the need_resched flag before * disabling preemption. When we want to enable preemption we * check the flag, if it is set, then we call preempt_enable_no_resched. * Otherwise, we call preempt_enable. * - * The rational for doing the above is that if need resched is set + * The rational for doing the above is that if need_resched is set * and we have yet to reschedule, we are either in an atomic location * (where we do not need to check for scheduling) or we are inside * the scheduler and do not want to resched. @@ -642,7 +642,7 @@ static inline int ftrace_preempt_disable(void) * * This is a scheduler safe way to enable preemption and not miss * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we were either inside an atomic or + * If resched is set, then we are either inside an atomic or * are inside the scheduler (we would have already scheduled * otherwise). In this case, we do not want to call normal * preempt_enable, but preempt_enable_no_resched instead. -- cgit v1.2.3-59-g8ed1b From 4346cdd4647e5eef15817dbfc2c091cac55e33d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 8 Feb 2009 21:51:14 +0100 Subject: xfs: cleanup xfs_find_handle Remove the superflous igrab by keeping a reference on the path/file all the time and clean up various bits of surrounding code. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_ioctl.c | 106 ++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 62 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 4bd112313f33..6f04493b8aba 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -78,92 +78,74 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; + struct file *file = NULL; + struct path path; + int error; + struct xfs_inode *ip; - memset((char *)&handle, 0, sizeof(handle)); - - switch (cmd) { - case XFS_IOC_PATH_TO_FSHANDLE: - case XFS_IOC_PATH_TO_HANDLE: { - struct path path; - int error = user_lpath((const char __user *)hreq->path, &path); + if (cmd == XFS_IOC_FD_TO_HANDLE) { + file = fget(hreq->fd); + if (!file) + return -EBADF; + inode = file->f_path.dentry->d_inode; + } else { + error = user_lpath((const char __user *)hreq->path, &path); if (error) return error; - - ASSERT(path.dentry); - ASSERT(path.dentry->d_inode); - inode = igrab(path.dentry->d_inode); - path_put(&path); - break; + inode = path.dentry->d_inode; } + ip = XFS_I(inode); - case XFS_IOC_FD_TO_HANDLE: { - struct file *file; - - file = fget(hreq->fd); - if (!file) - return -EBADF; + /* + * We can only generate handles for inodes residing on a XFS filesystem, + * and only for regular files, directories or symbolic links. + */ + error = -EINVAL; + if (inode->i_sb->s_magic != XFS_SB_MAGIC) + goto out_put; - ASSERT(file->f_path.dentry); - ASSERT(file->f_path.dentry->d_inode); - inode = igrab(file->f_path.dentry->d_inode); - fput(file); - break; - } + error = -EBADF; + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + goto out_put; - default: - ASSERT(0); - return -XFS_ERROR(EINVAL); - } - if (inode->i_sb->s_magic != XFS_SB_MAGIC) { - /* we're not in XFS anymore, Toto */ - iput(inode); - return -XFS_ERROR(EINVAL); - } + memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - break; - default: - iput(inode); - return -XFS_ERROR(EBADF); - } - - /* now we can grab the fsid */ - memcpy(&handle.ha_fsid, XFS_I(inode)->i_mount->m_fixedfsid, - sizeof(xfs_fsid_t)); - hsize = sizeof(xfs_fsid_t); - - if (cmd != XFS_IOC_PATH_TO_FSHANDLE) { - xfs_inode_t *ip = XFS_I(inode); + if (cmd == XFS_IOC_PATH_TO_FSHANDLE) { + /* + * This handle only contains an fsid, zero the rest. + */ + memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); + hsize = sizeof(xfs_fsid_t); + } else { int lock_mode; - /* need to get access to the xfs_inode to read the generation */ lock_mode = xfs_ilock_map_shared(ip); - - /* fill in fid section of handle from inode */ handle.ha_fid.fid_len = sizeof(xfs_fid_t) - sizeof(handle.ha_fid.fid_len); handle.ha_fid.fid_pad = 0; handle.ha_fid.fid_gen = ip->i_d.di_gen; handle.ha_fid.fid_ino = ip->i_ino; - xfs_iunlock_map_shared(ip, lock_mode); hsize = XFS_HSIZE(handle); } - /* now copy our handle into the user buffer & write out the size */ + error = -EFAULT; if (copy_to_user(hreq->ohandle, &handle, hsize) || - copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) { - iput(inode); - return -XFS_ERROR(EFAULT); - } + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) + goto out_put; - iput(inode); - return 0; + error = 0; + + out_put: + if (cmd == XFS_IOC_FD_TO_HANDLE) + fput(file); + else + path_put(&path); + return error; } /* -- cgit v1.2.3-59-g8ed1b From 8e9b6e7fa4544ea8a0e030c8987b918509c8ff47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 8 Feb 2009 21:51:42 +0100 Subject: xfs: remove the unused XFS_QMOPT_DQLOCK flag The XFS_QMOPT_DQLOCK flag introduces major complexity in the quota subsystem but isn't actually used anywhere. So remove it and all the hazzles it introduces. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher --- fs/xfs/quota/xfs_qm.c | 146 +++++++++++++---------------------------- fs/xfs/quota/xfs_trans_dquot.c | 16 ++--- fs/xfs/xfs_quota.h | 1 - 3 files changed, 50 insertions(+), 113 deletions(-) diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 7a2beb64314f..fd0b383d72a5 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -632,7 +632,6 @@ xfs_qm_dqattach_one( xfs_dqid_t id, uint type, uint doalloc, - uint dolock, xfs_dquot_t *udqhint, /* hint */ xfs_dquot_t **IO_idqpp) { @@ -641,16 +640,16 @@ xfs_qm_dqattach_one( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); error = 0; + /* * See if we already have it in the inode itself. IO_idqpp is * &i_udquot or &i_gdquot. This made the code look weird, but * made the logic a lot simpler. */ - if ((dqp = *IO_idqpp)) { - if (dolock) - xfs_dqlock(dqp); + dqp = *IO_idqpp; + if (dqp) { xfs_dqtrace_entry(dqp, "DQATTACH: found in ip"); - goto done; + return 0; } /* @@ -659,38 +658,38 @@ xfs_qm_dqattach_one( * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside * the user dquot. */ - ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); - if (udqhint && !dolock) + if (udqhint) { + ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); xfs_dqlock(udqhint); - /* - * No need to take dqlock to look at the id. - * The ID can't change until it gets reclaimed, and it won't - * be reclaimed as long as we have a ref from inode and we hold - * the ilock. - */ - if (udqhint && - (dqp = udqhint->q_gdquot) && - (be32_to_cpu(dqp->q_core.d_id) == id)) { - ASSERT(XFS_DQ_IS_LOCKED(udqhint)); - xfs_dqlock(dqp); - XFS_DQHOLD(dqp); - ASSERT(*IO_idqpp == NULL); - *IO_idqpp = dqp; - if (!dolock) { + /* + * No need to take dqlock to look at the id. + * + * The ID can't change until it gets reclaimed, and it won't + * be reclaimed as long as we have a ref from inode and we + * hold the ilock. + */ + dqp = udqhint->q_gdquot; + if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) { + xfs_dqlock(dqp); + XFS_DQHOLD(dqp); + ASSERT(*IO_idqpp == NULL); + *IO_idqpp = dqp; + xfs_dqunlock(dqp); xfs_dqunlock(udqhint); + return 0; } - goto done; - } - /* - * We can't hold a dquot lock when we call the dqget code. - * We'll deadlock in no time, because of (not conforming to) - * lock ordering - the inodelock comes before any dquot lock, - * and we may drop and reacquire the ilock in xfs_qm_dqget(). - */ - if (udqhint) + + /* + * We can't hold a dquot lock when we call the dqget code. + * We'll deadlock in no time, because of (not conforming to) + * lock ordering - the inodelock comes before any dquot lock, + * and we may drop and reacquire the ilock in xfs_qm_dqget(). + */ xfs_dqunlock(udqhint); + } + /* * Find the dquot from somewhere. This bumps the * reference count of dquot and returns it locked. @@ -698,48 +697,19 @@ xfs_qm_dqattach_one( * disk and we didn't ask it to allocate; * ESRCH if quotas got turned off suddenly. */ - if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type, - doalloc|XFS_QMOPT_DOWARN, &dqp))) { - if (udqhint && dolock) - xfs_dqlock(udqhint); - goto done; - } + error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp); + if (error) + return error; xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget"); + /* * dqget may have dropped and re-acquired the ilock, but it guarantees * that the dquot returned is the one that should go in the inode. */ *IO_idqpp = dqp; - ASSERT(dqp); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - if (! dolock) { - xfs_dqunlock(dqp); - goto done; - } - if (! udqhint) - goto done; - - ASSERT(udqhint); - ASSERT(dolock); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - if (! xfs_qm_dqlock_nowait(udqhint)) { - xfs_dqunlock(dqp); - xfs_dqlock(udqhint); - xfs_dqlock(dqp); - } - done: -#ifdef QUOTADEBUG - if (udqhint) { - if (dolock) - ASSERT(XFS_DQ_IS_LOCKED(udqhint)); - } - if (! error) { - if (dolock) - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - } -#endif - return error; + xfs_dqunlock(dqp); + return 0; } @@ -754,24 +724,15 @@ xfs_qm_dqattach_one( STATIC void xfs_qm_dqattach_grouphint( xfs_dquot_t *udq, - xfs_dquot_t *gdq, - uint locked) + xfs_dquot_t *gdq) { xfs_dquot_t *tmp; -#ifdef QUOTADEBUG - if (locked) { - ASSERT(XFS_DQ_IS_LOCKED(udq)); - ASSERT(XFS_DQ_IS_LOCKED(gdq)); - } -#endif - if (! locked) - xfs_dqlock(udq); + xfs_dqlock(udq); if ((tmp = udq->q_gdquot)) { if (tmp == gdq) { - if (! locked) - xfs_dqunlock(udq); + xfs_dqunlock(udq); return; } @@ -781,8 +742,6 @@ xfs_qm_dqattach_grouphint( * because the freelist lock comes before dqlocks. */ xfs_dqunlock(udq); - if (locked) - xfs_dqunlock(gdq); /* * we took a hard reference once upon a time in dqget, * so give it back when the udquot no longer points at it @@ -795,9 +754,7 @@ xfs_qm_dqattach_grouphint( } else { ASSERT(XFS_DQ_IS_LOCKED(udq)); - if (! locked) { - xfs_dqlock(gdq); - } + xfs_dqlock(gdq); } ASSERT(XFS_DQ_IS_LOCKED(udq)); @@ -810,10 +767,9 @@ xfs_qm_dqattach_grouphint( XFS_DQHOLD(gdq); udq->q_gdquot = gdq; } - if (! locked) { - xfs_dqunlock(gdq); - xfs_dqunlock(udq); - } + + xfs_dqunlock(gdq); + xfs_dqunlock(udq); } @@ -821,8 +777,6 @@ xfs_qm_dqattach_grouphint( * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON * into account. * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. - * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty - * much made this code a complete mess, but it has been pretty useful. * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL. * Inode may get unlocked and relocked in here, and the caller must deal with * the consequences. @@ -851,7 +805,6 @@ xfs_qm_dqattach( if (XFS_IS_UQUOTA_ON(mp)) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, flags & XFS_QMOPT_DQALLOC, - flags & XFS_QMOPT_DQLOCK, NULL, &ip->i_udquot); if (error) goto done; @@ -863,11 +816,9 @@ xfs_qm_dqattach( error = XFS_IS_GQUOTA_ON(mp) ? xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, flags & XFS_QMOPT_DQALLOC, - flags & XFS_QMOPT_DQLOCK, ip->i_udquot, &ip->i_gdquot) : xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, flags & XFS_QMOPT_DQALLOC, - flags & XFS_QMOPT_DQLOCK, ip->i_udquot, &ip->i_gdquot); /* * Don't worry about the udquot that we may have @@ -898,22 +849,13 @@ xfs_qm_dqattach( /* * Attach i_gdquot to the gdquot hint inside the i_udquot. */ - xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot, - flags & XFS_QMOPT_DQLOCK); + xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot); } done: #ifdef QUOTADEBUG if (! error) { - if (ip->i_udquot) { - if (flags & XFS_QMOPT_DQLOCK) - ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot)); - } - if (ip->i_gdquot) { - if (flags & XFS_QMOPT_DQLOCK) - ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot)); - } if (XFS_IS_UQUOTA_ON(mp)) ASSERT(ip->i_udquot); if (XFS_IS_OQUOTA_ON(mp)) diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 99611381e740..447173bcf96d 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -624,10 +624,9 @@ xfs_trans_dqresv( xfs_qcnt_t *resbcountp; xfs_quotainfo_t *q = mp->m_quotainfo; - if (! (flags & XFS_QMOPT_DQLOCK)) { - xfs_dqlock(dqp); - } - ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + xfs_dqlock(dqp); + if (flags & XFS_TRANS_DQ_RES_BLKS) { hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); if (!hardlimit) @@ -740,10 +739,8 @@ xfs_trans_dqresv( ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); error_return: - if (! (flags & XFS_QMOPT_DQLOCK)) { - xfs_dqunlock(dqp); - } - return (error); + xfs_dqunlock(dqp); + return error; } @@ -753,8 +750,7 @@ error_return: * grp/prj quotas is important, because this follows a both-or-nothing * approach. * - * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. - * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. + * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 48965ecaa155..63d41acf4533 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -185,7 +185,6 @@ typedef struct xfs_qoff_logformat { * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) */ -#define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */ #define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ -- cgit v1.2.3-59-g8ed1b From 7153f8ba2b18f18f661d5cb172782bcae2eadce9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Feb 2009 08:36:46 +0100 Subject: xfs: remove iclog calculation special cases Our default has been to always use 8 32KB log buffers for a while now, so remove the special casing for larger block size filesystem to use the same or even lower number of buffers. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/xfs_log.c | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f4726f702a9e..493c07f6a99a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1164,32 +1164,8 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp, log->l_iclog_hsize = BBSIZE; log->l_iclog_heads = 1; - /* - * For 16KB, we use 3 32KB buffers. For 32KB block sizes, we use - * 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers. - */ - if (mp->m_sb.sb_blocksize >= 16*1024) { - log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; - log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; - if (mp->m_logbufs <= 0) { - switch (mp->m_sb.sb_blocksize) { - case 16*1024: /* 16 KB */ - log->l_iclog_bufs = 3; - break; - case 32*1024: /* 32 KB */ - log->l_iclog_bufs = 4; - break; - case 64*1024: /* 64 KB */ - log->l_iclog_bufs = 8; - break; - default: - xlog_panic("XFS: Invalid blocksize"); - break; - } - } - } - -done: /* are we being asked to make the sizes selected above visible? */ +done: + /* are we being asked to make the sizes selected above visible? */ if (mp->m_logbufs == 0) mp->m_logbufs = log->l_iclog_bufs; if (mp->m_logbsize == 0) -- cgit v1.2.3-59-g8ed1b From 0d87e656dd961145f47ede5e37eceecfdc7d8197 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Feb 2009 08:37:14 +0100 Subject: xfs: remove superflous inobt macros xfs_ialloc_btree.h has a a cuple of macros that only obsfucate the code but don't provide any abstraction benefits. This patches removes those and cleans up the reamaining defintions up a little. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/xfs_ialloc.c | 10 +++++----- fs/xfs/xfs_ialloc_btree.h | 22 ++++------------------ fs/xfs/xfs_mount.h | 2 +- fs/xfs/xfs_trans.h | 12 ++++++------ fs/xfs/xfs_trans_space.h | 2 +- 5 files changed, 17 insertions(+), 31 deletions(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index ab016e5ae7be..62d1ceae0943 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -230,7 +230,7 @@ xfs_ialloc_ag_alloc( args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; /* Allow space for the inode btree to split. */ - args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; + args.minleft = args.mp->m_in_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; } else @@ -270,7 +270,7 @@ xfs_ialloc_ag_alloc( /* * Allow space for the inode btree to split. */ - args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; + args.minleft = args.mp->m_in_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -943,7 +943,7 @@ nextag: ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); - XFS_INOBT_CLR_FREE(&rec, offset); + rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) @@ -1105,11 +1105,11 @@ xfs_difree( */ off = agino - rec.ir_startino; ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); - ASSERT(!XFS_INOBT_IS_FREE(&rec, off)); + ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off))); /* * Mark the inode free & increment the count. */ - XFS_INOBT_SET_FREE(&rec, off); + rec.ir_free |= XFS_INOBT_MASK(off); rec.ir_freecount++; /* diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 5580e255ff06..f782ad0c4769 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -32,14 +32,14 @@ struct xfs_mount; #define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ typedef __uint64_t xfs_inofree_t; -#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) +#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) #define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) -#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { - return (((n) >= XFS_INODES_PER_CHUNK ? \ - (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i); + return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; } /* @@ -68,20 +68,6 @@ typedef struct xfs_inobt_key { /* btree pointer type */ typedef __be32 xfs_inobt_ptr_t; -/* - * Bit manipulations for ir_free. - */ -#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) -#define XFS_INOBT_IS_FREE(rp,i) \ - (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0) -#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i)) -#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i)) - -/* - * Maximum number of inode btree levels. - */ -#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels) - /* * block numbers in the AG. */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 670c10e098a0..de9beed9dce1 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -273,7 +273,7 @@ typedef struct xfs_mount { uint m_inobt_mnr[2]; /* min inobt btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ - uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ + uint m_in_maxlevels; /* max inobt btree levels. */ struct xfs_perag *m_perag; /* per-ag accounting info */ struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ struct mutex m_growlock; /* growfs mutex */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index d6fe4a88d79f..166f728bea70 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -343,7 +343,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \ (128 * 5) + \ XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) #define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) @@ -449,9 +449,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \ (2 * (mp)->m_sb.sb_sectsize + \ XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ - XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ + XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \ XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) #define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) @@ -481,9 +481,9 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \ (3 * (mp)->m_sb.sb_sectsize + \ XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ - XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ + XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \ XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) #define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) @@ -513,7 +513,7 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \ (128 * 5) + \ XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \ XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h index 4ea2e5074bdd..7d2c920dfb9c 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/xfs_trans_space.h @@ -47,7 +47,7 @@ #define XFS_DIRREMOVE_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ - (XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1) + (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1) /* * Space reservation values for various transactions. -- cgit v1.2.3-59-g8ed1b From a568778739030fb68805dda1af2f4ebbc3adad7d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Feb 2009 08:37:39 +0100 Subject: xfs: remove uchar_t/ushort_t/uint_t/ulong_t types Just another set of types obsfucating the code, remove them. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/linux-2.6/xfs_super.c | 2 +- fs/xfs/xfs_ag.h | 4 ++-- fs/xfs/xfs_da_btree.c | 2 +- fs/xfs/xfs_da_btree.h | 6 +++--- fs/xfs/xfs_log_priv.h | 2 +- fs/xfs/xfs_log_recover.c | 4 ++-- fs/xfs/xfs_types.h | 8 -------- 7 files changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index faf3aa3ca154..712fa2950875 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -180,7 +180,7 @@ xfs_parseargs( int dswidth = 0; int iosize = 0; int dmapi_implies_ikeep = 1; - uchar_t iosizelog = 0; + __uint8_t iosizelog = 0; /* * Copy binary VFS mount flags we are interested in. diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 143d63ecb20a..c8641f713caa 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -223,8 +223,8 @@ typedef struct xfs_perag be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) #define XFS_MIN_FREELIST_PAG(pag,mp) \ (XFS_MIN_FREELIST_RAW( \ - (uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ - (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) #define XFS_AGB_TO_FSB(mp,agno,agbno) \ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index c45f74ff1a5b..9ff6e57a5075 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1503,7 +1503,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * This is implemented with some source-level loop unrolling. */ xfs_dahash_t -xfs_da_hashname(const uchar_t *name, int namelen) +xfs_da_hashname(const __uint8_t *name, int namelen) { xfs_dahash_t hash; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 70b710c1792d..9d5a7e8c38fe 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -91,9 +91,9 @@ enum xfs_dacmp { * Structure to ease passing around component names. */ typedef struct xfs_da_args { - const uchar_t *name; /* string (maybe not NULL terminated) */ + const __uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ - uchar_t *value; /* set of bytes (maybe contain NULLs) */ + __uint8_t *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ int flags; /* argument flags (eg: ATTR_NOCREATE) */ xfs_dahash_t hashval; /* hash value of name */ @@ -251,7 +251,7 @@ xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, xfs_dabuf_t *dead_buf); -uint xfs_da_hashname(const uchar_t *name_string, int name_length); +uint xfs_da_hashname(const __uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, const char *name, int len); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 654167be0efb..3670d48fd774 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -359,7 +359,7 @@ typedef struct xlog_in_core { int ic_size; int ic_offset; int ic_bwritecnt; - ushort_t ic_state; + unsigned short ic_state; char *ic_datap; /* pointer to iclog data */ #ifdef XFS_LOG_TRACE struct ktrace *ic_trace; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b1047de2fffd..504d540e0e2c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -211,11 +211,11 @@ xlog_header_check_dump( cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); for (b = 0; b < 16; b++) - cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); + cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&mp->m_sb.sb_uuid)[b]); cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); cmn_err(CE_DEBUG, " log : uuid = "); for (b = 0; b < 16; b++) - cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); + cmn_err(CE_DEBUG, "%02x", ((__uint8_t *)&head->h_fs_uuid)[b]); cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); } #else diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index b2f724502f1b..d725428c9df6 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -20,14 +20,6 @@ #ifdef __KERNEL__ -/* - * POSIX Extensions - */ -typedef unsigned char uchar_t; -typedef unsigned short ushort_t; -typedef unsigned int uint_t; -typedef unsigned long ulong_t; - /* * Additional type declarations for XFS */ -- cgit v1.2.3-59-g8ed1b From 517b5e8c8516a25a0df3b530fd183eb493a96698 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Feb 2009 08:38:02 +0100 Subject: xfs: merge xfs_mkdir into xfs_create xfs_create and xfs_mkdir only have minor differences, so merge both of them into a sigle function. While we're at it also make the error handling code more straight-forward. Signed-off-by: Christoph Hellwig Dave Chinner --- fs/xfs/linux-2.6/xfs_iops.c | 30 ++-- fs/xfs/xfs_vnodeops.c | 351 +++++++++++--------------------------------- fs/xfs/xfs_vnodeops.h | 2 - 3 files changed, 92 insertions(+), 291 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 7aa53fefc67f..e103b05dc777 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -211,8 +211,13 @@ xfs_vn_mknod( * Irix uses Missed'em'V split, but doesn't want to see * the upper 5 bits of (14bit) major. */ - if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) - return -EINVAL; + if (S_ISCHR(mode) || S_ISBLK(mode)) { + if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) + return -EINVAL; + rdev = sysv_encode_dev(rdev); + } else { + rdev = 0; + } if (test_default_acl && test_default_acl(dir)) { if (!_ACL_ALLOC(default_acl)) { @@ -224,28 +229,11 @@ xfs_vn_mknod( } } - xfs_dentry_to_name(&name, dentry); - if (IS_POSIXACL(dir) && !default_acl) mode &= ~current->fs->umask; - switch (mode & S_IFMT) { - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: - rdev = sysv_encode_dev(rdev); - case S_IFREG: - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); - break; - case S_IFDIR: - error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL); - break; - default: - error = EINVAL; - break; - } - + xfs_dentry_to_name(&name, dentry); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); if (unlikely(error)) goto out_free_acl; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index bc0a0a75b1d6..59de04954bc8 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1387,23 +1387,28 @@ xfs_create( xfs_inode_t **ipp, cred_t *credp) { - xfs_mount_t *mp = dp->i_mount; - xfs_inode_t *ip; - xfs_trans_t *tp; + int is_dir = S_ISDIR(mode); + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; boolean_t unlock_dp_on_error = B_FALSE; - int dm_event_sent = 0; uint cancel_flags; int committed; xfs_prid_t prid; - struct xfs_dquot *udqp, *gdqp; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; uint resblks; + uint log_res; + uint log_count; - ASSERT(!*ipp); xfs_itrace_entry(dp); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, dp, DM_RIGHT_NULL, NULL, @@ -1412,84 +1417,97 @@ xfs_create( if (error) return error; - dm_event_sent = 1; } - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* Return through std_return after this point. */ - - udqp = gdqp = NULL; if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) prid = dp->i_d.di_projid; else - prid = (xfs_prid_t)dfltprid; + prid = dfltprid; /* * Make sure that we have allocated dquot(s) on disk. */ error = XFS_QM_DQVOPALLOC(mp, dp, current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; - ip = NULL; + if (is_dir) { + rdev = 0; + resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + log_res = XFS_MKDIR_LOG_RES(mp); + log_count = XFS_MKDIR_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); + } else { + resblks = XFS_CREATE_SPACE_RES(mp, name->len); + log_res = XFS_CREATE_LOG_RES(mp); + log_count = XFS_CREATE_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); + } - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_CREATE_SPACE_RES(mp, name->len); + /* * Initially assume that the file does not exist and * reserve the resources for that case. If that is not * the case we'll drop the one we have and get a more * appropriate transaction later. */ - error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); + error = xfs_trans_reserve(tp, resblks, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); + error = xfs_trans_reserve(tp, 0, log_res, 0, + XFS_TRANS_PERM_LOG_RES, log_count); } if (error) { cancel_flags = 0; - goto error_return; + goto out_trans_cancel; } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = B_TRUE; - xfs_bmap_init(&free_list, &first_block); + /* + * Check for directory link count overflow. + */ + if (is_dir && dp->i_d.di_nlink >= XFS_MAXLINK) { + error = XFS_ERROR(EMLINK); + goto out_trans_cancel; + } - ASSERT(ip == NULL); + xfs_bmap_init(&free_list, &first_block); /* * Reserve disk quota and the inode. */ error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); if (error) - goto error_return; + goto out_trans_cancel; error = xfs_dir_canenter(tp, dp, name, resblks); if (error) - goto error_return; - error = xfs_dir_ialloc(&tp, dp, mode, 1, - rdev, credp, prid, resblks > 0, - &ip, &committed); + goto out_trans_cancel; + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, credp, + prid, resblks > 0, &ip, &committed); if (error) { if (error == ENOSPC) - goto error_return; - goto abort_return; + goto out_trans_cancel; + goto out_trans_abort; } - xfs_itrace_ref(ip); /* * At this point, we've gotten a newly allocated inode. * It is locked (and joined to the transaction). */ - + xfs_itrace_ref(ip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* @@ -1508,19 +1526,28 @@ xfs_create( resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != ENOSPC); - goto abort_return; + goto out_trans_abort; } xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + if (is_dir) { + error = xfs_dir_init(tp, ip, dp); + if (error) + goto out_bmap_cancel; + + error = xfs_bumplink(tp, dp); + if (error) + goto out_bmap_cancel; + } + /* * If this is a synchronous mount, make sure that the * create transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - } /* * Attach the dquot(s) to the inodes and modify them incore. @@ -1537,16 +1564,13 @@ xfs_create( IHOLD(ip); error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - goto abort_rele; - } + if (error) + goto out_abort_rele; error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) { IRELE(ip); - tp = NULL; - goto error_return; + goto out_dqrele; } XFS_QM_DQRELE(mp, udqp); @@ -1555,26 +1579,22 @@ xfs_create( *ipp = ip; /* Fallthrough to std_return with error = 0 */ - -std_return: - if ((*ipp || (error != 0 && dm_event_sent != 0)) && - DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, - dp, DM_RIGHT_NULL, - *ipp ? ip : NULL, - DM_RIGHT_NULL, name->name, NULL, - mode, error, 0); + std_return: + if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { + XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, dp, DM_RIGHT_NULL, + ip, DM_RIGHT_NULL, name->name, NULL, mode, + error, 0); } + return error; - abort_return: + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_abort: cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - - error_return: - if (tp != NULL) - xfs_trans_cancel(tp, cancel_flags); - + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_dqrele: XFS_QM_DQRELE(mp, udqp); XFS_QM_DQRELE(mp, gdqp); @@ -1583,20 +1603,18 @@ std_return: goto std_return; - abort_rele: + out_abort_rele: /* * Wait until after the current transaction is aborted to * release the inode. This prevents recursive transactions * and deadlocks from xfs_inactive. */ + xfs_bmap_cancel(&free_list); cancel_flags |= XFS_TRANS_ABORT; xfs_trans_cancel(tp, cancel_flags); IRELE(ip); - - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - goto std_return; + unlock_dp_on_error = B_FALSE; + goto out_dqrele; } #ifdef DEBUG @@ -2112,209 +2130,6 @@ std_return: goto std_return; } - -int -xfs_mkdir( - xfs_inode_t *dp, - struct xfs_name *dir_name, - mode_t mode, - xfs_inode_t **ipp, - cred_t *credp) -{ - xfs_mount_t *mp = dp->i_mount; - xfs_inode_t *cdp; /* inode of created dir */ - xfs_trans_t *tp; - int cancel_flags; - int error; - int committed; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - boolean_t unlock_dp_on_error = B_FALSE; - boolean_t created = B_FALSE; - int dm_event_sent = 0; - xfs_prid_t prid; - struct xfs_dquot *udqp, *gdqp; - uint resblks; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - tp = NULL; - - if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, - dp, DM_RIGHT_NULL, NULL, - DM_RIGHT_NULL, dir_name->name, NULL, - mode, 0, 0); - if (error) - return error; - dm_event_sent = 1; - } - - /* Return through std_return after this point. */ - - xfs_itrace_entry(dp); - - mp = dp->i_mount; - udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = dp->i_d.di_projid; - else - prid = (xfs_prid_t)dfltprid; - - /* - * Make sure that we have allocated dquot(s) on disk. - */ - error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(), current_fsgid(), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); - if (error) - goto std_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len); - error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_MKDIR_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - goto error_return; - } - - xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); - unlock_dp_on_error = B_TRUE; - - /* - * Check for directory link count overflow. - */ - if (dp->i_d.di_nlink >= XFS_MAXLINK) { - error = XFS_ERROR(EMLINK); - goto error_return; - } - - /* - * Reserve disk quota and the inode. - */ - error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); - if (error) - goto error_return; - - error = xfs_dir_canenter(tp, dp, dir_name, resblks); - if (error) - goto error_return; - /* - * create the directory inode. - */ - error = xfs_dir_ialloc(&tp, dp, mode, 2, - 0, credp, prid, resblks > 0, - &cdp, NULL); - if (error) { - if (error == ENOSPC) - goto error_return; - goto abort_return; - } - xfs_itrace_ref(cdp); - - /* - * Now we add the directory inode to the transaction. - * We waited until now since xfs_dir_ialloc might start - * a new transaction. Had we joined the transaction - * earlier, the locks might have gotten released. An error - * from here on will result in the transaction cancel - * unlocking dp so don't do it explicitly in the error path. - */ - IHOLD(dp); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - unlock_dp_on_error = B_FALSE; - - xfs_bmap_init(&free_list, &first_block); - - error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino, - &first_block, &free_list, resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); - if (error) { - ASSERT(error != ENOSPC); - goto error1; - } - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - error = xfs_dir_init(tp, cdp, dp); - if (error) - goto error2; - - error = xfs_bumplink(tp, dp); - if (error) - goto error2; - - created = B_TRUE; - - *ipp = cdp; - IHOLD(cdp); - - /* - * Attach the dquots to the new inode and modify the icount incore. - */ - XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp); - - /* - * If this is a synchronous mount, make sure that the - * mkdir transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - IRELE(cdp); - goto error2; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - if (error) { - IRELE(cdp); - } - - /* Fall through to std_return with error = 0 or errno from - * xfs_trans_commit. */ - -std_return: - if ((created || (error != 0 && dm_event_sent != 0)) && - DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, - dp, DM_RIGHT_NULL, - created ? cdp : NULL, - DM_RIGHT_NULL, - dir_name->name, NULL, - mode, error, 0); - } - return error; - - error2: - error1: - xfs_bmap_cancel(&free_list); - abort_return: - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_trans_cancel(tp, cancel_flags); - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - goto std_return; -} - int xfs_symlink( xfs_inode_t *dp, diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 2258df3fae84..04373c6c61ff 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -31,8 +31,6 @@ int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); -int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, - mode_t mode, struct xfs_inode **ipp, cred_t *credp); int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, xfs_off_t *offset, filldir_t filldir); int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, -- cgit v1.2.3-59-g8ed1b From e249458220c9799fe94573abd341d29c83579671 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Feb 2009 08:38:39 +0100 Subject: xfs: remove XFS_QM_LOCK/XFS_QM_UNLOCK/XFS_QM_HOLD/XFS_QM_RELE Remove these macros which only obsfucated the code in rather nast ways. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/quota/xfs_qm.c | 13 ++++++------- fs/xfs/quota/xfs_qm.h | 5 ----- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index fd0b383d72a5..246790ad9589 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -219,7 +219,7 @@ xfs_qm_hold_quotafs_ref( * the structure could disappear between the entry to this routine and * a HOLD operation if not locked. */ - XFS_QM_LOCK(xfs_Gqm); + mutex_lock(&xfs_Gqm_lock); if (xfs_Gqm == NULL) xfs_Gqm = xfs_Gqm_init(); @@ -228,8 +228,8 @@ xfs_qm_hold_quotafs_ref( * debugging and statistical purposes, but ... * Just take a reference and get out. */ - XFS_QM_HOLD(xfs_Gqm); - XFS_QM_UNLOCK(xfs_Gqm); + xfs_Gqm->qm_nrefs++; + mutex_unlock(&xfs_Gqm_lock); return 0; } @@ -277,13 +277,12 @@ xfs_qm_rele_quotafs_ref( * Destroy the entire XQM. If somebody mounts with quotaon, this'll * be restarted. */ - XFS_QM_LOCK(xfs_Gqm); - XFS_QM_RELE(xfs_Gqm); - if (xfs_Gqm->qm_nrefs == 0) { + mutex_lock(&xfs_Gqm_lock); + if (--xfs_Gqm->qm_nrefs == 0) { xfs_qm_destroy(xfs_Gqm); xfs_Gqm = NULL; } - XFS_QM_UNLOCK(xfs_Gqm); + mutex_unlock(&xfs_Gqm_lock); } /* diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index ddf09166387c..d48c4dbe1428 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -158,11 +158,6 @@ typedef struct xfs_dquot_acct { #define XFS_QM_IWARNLIMIT 5 #define XFS_QM_RTBWARNLIMIT 5 -#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock)) -#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock)) -#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++) -#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) - extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); extern void xfs_qm_mount_quotas(xfs_mount_t *); extern int xfs_qm_quotacheck(xfs_mount_t *); -- cgit v1.2.3-59-g8ed1b From 7201813bf55cc06e6a7405831f63df96ee7842e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Feb 2009 08:39:24 +0100 Subject: xfs: use mutex_is_locked in XFS_DQ_IS_LOCKED Now that we have a helper to test if a mutex is held use it instead of our own little hacks. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/quota/xfs_dquot.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index d443e93b4331..d1f726e0e5a4 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -109,19 +109,6 @@ enum { #define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) -#ifdef DEBUG -static inline int -XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp) -{ - if (mutex_trylock(&dqp->q_qlock)) { - mutex_unlock(&dqp->q_qlock); - return 0; - } - return 1; -} -#endif - - /* * Manage the q_flush completion queue embedded in the dquot. This completion * queue synchronizes processes attempting to flush the in-core dquot back to @@ -142,6 +129,7 @@ static inline void xfs_dqfunlock(xfs_dquot_t *dqp) complete(&dqp->q_flush); } +#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) #define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) -- cgit v1.2.3-59-g8ed1b From c9a192dcf906a33f59c555924e7796a4b9454217 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Feb 2009 08:47:22 +0100 Subject: xfs: sanitize qh_lock wrappers Get rid of various obsfucating wrappers for accessing the quota hash lock, we only keep the accessors for accessing the mplist and freelist locks as they encode a multi-level datastructure walk. But make sure all of them are defined in the same way as simple macros. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/quota/xfs_dquot.c | 28 ++++++++++++------------- fs/xfs/quota/xfs_qm.c | 49 +++++++------------------------------------ fs/xfs/quota/xfs_qm.h | 1 - fs/xfs/quota/xfs_quota_priv.h | 40 +++++++++++++---------------------- 4 files changed, 37 insertions(+), 81 deletions(-) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 6543c0b29753..e4babcc63423 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -804,7 +804,7 @@ xfs_qm_dqlookup( uint flist_locked; xfs_dquot_t *d; - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + ASSERT(mutex_is_locked(&qh->qh_lock)); flist_locked = B_FALSE; @@ -877,7 +877,7 @@ xfs_qm_dqlookup( /* * move the dquot to the front of the hashchain */ - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + ASSERT(mutex_is_locked(&qh->qh_lock)); if (dqp->HL_PREVP != &qh->qh_next) { xfs_dqtrace_entry(dqp, "DQLOOKUP: HASH MOVETOFRONT"); @@ -892,13 +892,13 @@ xfs_qm_dqlookup( } xfs_dqtrace_entry(dqp, "LOOKUP END"); *O_dqpp = dqp; - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + ASSERT(mutex_is_locked(&qh->qh_lock)); return (0); } } *O_dqpp = NULL; - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); + ASSERT(mutex_is_locked(&qh->qh_lock)); return (1); } @@ -956,7 +956,7 @@ xfs_qm_dqget( ASSERT(ip->i_gdquot == NULL); } #endif - XFS_DQ_HASH_LOCK(h); + mutex_lock(&h->qh_lock); /* * Look in the cache (hashtable). @@ -971,7 +971,7 @@ xfs_qm_dqget( */ ASSERT(*O_dqpp); ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); - XFS_DQ_HASH_UNLOCK(h); + mutex_unlock(&h->qh_lock); xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)"); return (0); /* success */ } @@ -991,7 +991,7 @@ xfs_qm_dqget( * we don't keep the lock across a disk read */ version = h->qh_version; - XFS_DQ_HASH_UNLOCK(h); + mutex_unlock(&h->qh_lock); /* * Allocate the dquot on the kernel heap, and read the ondisk @@ -1056,7 +1056,7 @@ xfs_qm_dqget( /* * Hashlock comes after ilock in lock order */ - XFS_DQ_HASH_LOCK(h); + mutex_lock(&h->qh_lock); if (version != h->qh_version) { xfs_dquot_t *tmpdqp; /* @@ -1072,7 +1072,7 @@ xfs_qm_dqget( * and start over. */ xfs_qm_dqput(tmpdqp); - XFS_DQ_HASH_UNLOCK(h); + mutex_unlock(&h->qh_lock); xfs_qm_dqdestroy(dqp); XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); goto again; @@ -1083,7 +1083,7 @@ xfs_qm_dqget( * Put the dquot at the beginning of the hash-chain and mp's list * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock .. */ - ASSERT(XFS_DQ_IS_HASH_LOCKED(h)); + ASSERT(mutex_is_locked(&h->qh_lock)); dqp->q_hash = h; XQM_HASHLIST_INSERT(h, dqp); @@ -1102,7 +1102,7 @@ xfs_qm_dqget( XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); xfs_qm_mplist_unlock(mp); - XFS_DQ_HASH_UNLOCK(h); + mutex_unlock(&h->qh_lock); dqret: ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); xfs_dqtrace_entry(dqp, "DQGET DONE"); @@ -1440,7 +1440,7 @@ xfs_qm_dqpurge( xfs_mount_t *mp = dqp->q_mount; ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); - ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); + ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock)); xfs_dqlock(dqp); /* @@ -1453,7 +1453,7 @@ xfs_qm_dqpurge( */ if (dqp->q_nrefs != 0) { xfs_dqunlock(dqp); - XFS_DQ_HASH_UNLOCK(dqp->q_hash); + mutex_unlock(&dqp->q_hash->qh_lock); return (1); } @@ -1517,7 +1517,7 @@ xfs_qm_dqpurge( memset(&dqp->q_core, 0, sizeof(dqp->q_core)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - XFS_DQ_HASH_UNLOCK(thishash); + mutex_unlock(&thishash->qh_lock); return (0); } diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 246790ad9589..11d0a4f89a0e 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -69,8 +69,6 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); STATIC void xfs_qm_freelist_init(xfs_frlist_t *); STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); -STATIC int xfs_qm_mplist_nowait(xfs_mount_t *); -STATIC int xfs_qm_dqhashlock_nowait(xfs_dquot_t *); STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); @@ -576,10 +574,10 @@ xfs_qm_dqpurge_int( continue; } - if (! xfs_qm_dqhashlock_nowait(dqp)) { + if (!mutex_trylock(&dqp->q_hash->qh_lock)) { nrecl = XFS_QI_MPLRECLAIMS(mp); xfs_qm_mplist_unlock(mp); - XFS_DQ_HASH_LOCK(dqp->q_hash); + mutex_lock(&dqp->q_hash->qh_lock); xfs_qm_mplist_lock(mp); /* @@ -589,7 +587,7 @@ xfs_qm_dqpurge_int( * this point, but somebody might be taking things off. */ if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { - XFS_DQ_HASH_UNLOCK(dqp->q_hash); + mutex_unlock(&dqp->q_hash->qh_lock); goto again; } } @@ -2027,7 +2025,7 @@ xfs_qm_shake_freelist( * a dqlookup process that holds the hashlock that is * waiting for the freelist lock. */ - if (! xfs_qm_dqhashlock_nowait(dqp)) { + if (!mutex_trylock(&dqp->q_hash->qh_lock)) { xfs_dqfunlock(dqp); xfs_dqunlock(dqp); dqp = dqp->dq_flnext; @@ -2044,7 +2042,7 @@ xfs_qm_shake_freelist( /* XXX put a sentinel so that we can come back here */ xfs_dqfunlock(dqp); xfs_dqunlock(dqp); - XFS_DQ_HASH_UNLOCK(hash); + mutex_unlock(&hash->qh_lock); xfs_qm_freelist_unlock(xfs_Gqm); if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) return nreclaimed; @@ -2061,7 +2059,7 @@ xfs_qm_shake_freelist( XQM_HASHLIST_REMOVE(hash, dqp); xfs_dqfunlock(dqp); xfs_qm_mplist_unlock(dqp->q_mount); - XFS_DQ_HASH_UNLOCK(hash); + mutex_unlock(&hash->qh_lock); off_freelist: XQM_FREELIST_REMOVE(dqp); @@ -2203,7 +2201,7 @@ xfs_qm_dqreclaim_one(void) continue; } - if (! xfs_qm_dqhashlock_nowait(dqp)) + if (!mutex_trylock(&dqp->q_hash->qh_lock)) goto mplistunlock; ASSERT(dqp->q_nrefs == 0); @@ -2212,7 +2210,7 @@ xfs_qm_dqreclaim_one(void) XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); XQM_FREELIST_REMOVE(dqp); dqpout = dqp; - XFS_DQ_HASH_UNLOCK(dqp->q_hash); + mutex_unlock(&dqp->q_hash->qh_lock); mplistunlock: xfs_qm_mplist_unlock(dqp->q_mount); xfs_dqfunlock(dqp); @@ -2715,34 +2713,3 @@ xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq) { xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); } - -STATIC int -xfs_qm_dqhashlock_nowait( - xfs_dquot_t *dqp) -{ - int locked; - - locked = mutex_trylock(&((dqp)->q_hash->qh_lock)); - return locked; -} - -int -xfs_qm_freelist_lock_nowait( - xfs_qm_t *xqm) -{ - int locked; - - locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock)); - return locked; -} - -STATIC int -xfs_qm_mplist_nowait( - xfs_mount_t *mp) -{ - int locked; - - ASSERT(mp->m_quotainfo); - locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp))); - return locked; -} diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index d48c4dbe1428..5ef08fb7562f 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -189,7 +189,6 @@ extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *, /* list stuff */ extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); extern void xfs_qm_freelist_unlink(xfs_dquot_t *); -extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *); /* system call interface */ extern int xfs_qm_quotactl(struct xfs_mount *, int, int, diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index c4fcea600bc2..8286b2842b6b 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -42,34 +42,24 @@ #define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) #define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) -#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock) #define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) #define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) -#define XQMLCK(h) (mutex_lock(&((h)->qh_lock))) -#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) -#ifdef DEBUG -struct xfs_dqhash; -static inline int XQMISLCKD(struct xfs_dqhash *h) -{ - if (mutex_trylock(&h->qh_lock)) { - mutex_unlock(&h->qh_lock); - return 0; - } - return 1; -} -#endif - -#define XFS_DQ_HASH_LOCK(h) XQMLCK(h) -#define XFS_DQ_HASH_UNLOCK(h) XQMUNLCK(h) -#define XFS_DQ_IS_HASH_LOCKED(h) XQMISLCKD(h) - -#define xfs_qm_mplist_lock(mp) XQMLCK(&(XFS_QI_MPL_LIST(mp))) -#define xfs_qm_mplist_unlock(mp) XQMUNLCK(&(XFS_QI_MPL_LIST(mp))) -#define XFS_QM_IS_MPLIST_LOCKED(mp) XQMISLCKD(&(XFS_QI_MPL_LIST(mp))) - -#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist)) -#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist)) +#define xfs_qm_mplist_lock(mp) \ + mutex_lock(&(XFS_QI_MPL_LIST(mp).qh_lock)) +#define xfs_qm_mplist_nowait(mp) \ + mutex_trylock(&(XFS_QI_MPL_LIST(mp).qh_lock)) +#define xfs_qm_mplist_unlock(mp) \ + mutex_unlock(&(XFS_QI_MPL_LIST(mp).qh_lock)) +#define XFS_QM_IS_MPLIST_LOCKED(mp) \ + mutex_is_locked(&(XFS_QI_MPL_LIST(mp).qh_lock)) + +#define xfs_qm_freelist_lock(qm) \ + mutex_lock(&((qm)->qm_dqfreelist.qh_lock)) +#define xfs_qm_freelist_lock_nowait(qm) \ + mutex_trylock(&((qm)->qm_dqfreelist.qh_lock)) +#define xfs_qm_freelist_unlock(qm) \ + mutex_unlock(&((qm)->qm_dqfreelist.qh_lock)) /* * Hash into a bucket in the dquot hash table, based on . -- cgit v1.2.3-59-g8ed1b From fcafb71b57a039f2113b0321b3b5535fea3a0aca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Feb 2009 08:47:34 +0100 Subject: xfs: get rid of indirections in the quotaops implementation Currently we call from the nicely abstracted linux quotaops into a ugly multiplexer just to split the calls out at the same boundary again. Rewrite the quota ops handling to remove that obfucation. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/Makefile | 1 + fs/xfs/linux-2.6/xfs_linux.h | 11 --- fs/xfs/linux-2.6/xfs_quotaops.c | 157 +++++++++++++++++++++++++++++++++ fs/xfs/linux-2.6/xfs_super.c | 62 +------------ fs/xfs/linux-2.6/xfs_super.h | 1 + fs/xfs/linux-2.6/xfs_sync.h | 1 + fs/xfs/quota/xfs_qm.h | 14 ++- fs/xfs/quota/xfs_qm_bhv.c | 1 - fs/xfs/quota/xfs_qm_syscalls.c | 188 ++-------------------------------------- fs/xfs/xfs_mount.h | 4 - fs/xfs/xfs_qmops.c | 1 - fs/xfs/xfs_quota.h | 2 + 12 files changed, 181 insertions(+), 262 deletions(-) create mode 100644 fs/xfs/linux-2.6/xfs_quotaops.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index c3dc491fff89..60f107e47fe9 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -33,6 +33,7 @@ xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ xfs_qm_syscalls.o \ xfs_qm_bhv.o \ xfs_qm.o) +xfs-$(CONFIG_XFS_QUOTA) += linux-2.6/xfs_quotaops.o ifeq ($(CONFIG_XFS_QUOTA),y) xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 507492d6dccd..fc8d776ba05b 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -147,17 +147,6 @@ #define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) -/* - * IRIX (BSD) quotactl makes use of separate commands for user/group, - * whereas on Linux the syscall encodes this information into the cmd - * field (see the QCMD macro in quota.h). These macros help keep the - * code portable - they are not visible from the syscall interface. - */ -#define Q_XSETGQLIM XQM_CMD(8) /* set groups disk limits */ -#define Q_XGETGQUOTA XQM_CMD(9) /* get groups disk limits */ -#define Q_XSETPQLIM XQM_CMD(10) /* set projects disk limits */ -#define Q_XGETPQUOTA XQM_CMD(11) /* get projects disk limits */ - #define dfltprid 0 #define MAXPATHLEN 1024 diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c new file mode 100644 index 000000000000..94d9a633d3d9 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_dmapi.h" +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "quota/xfs_qm.h" +#include + + +STATIC int +xfs_quota_type(int type) +{ + switch (type) { + case USRQUOTA: + return XFS_DQ_USER; + case GRPQUOTA: + return XFS_DQ_GROUP; + default: + return XFS_DQ_PROJ; + } +} + +STATIC int +xfs_fs_quota_sync( + struct super_block *sb, + int type) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + return -xfs_sync_inodes(mp, SYNC_DELWRI); +} + +STATIC int +xfs_fs_get_xstate( + struct super_block *sb, + struct fs_quota_stat *fqs) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + return -xfs_qm_scall_getqstat(mp, fqs); +} + +STATIC int +xfs_fs_set_xstate( + struct super_block *sb, + unsigned int uflags, + int op) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (uflags & XFS_QUOTA_UDQ_ACCT) + flags |= XFS_UQUOTA_ACCT; + if (uflags & XFS_QUOTA_PDQ_ACCT) + flags |= XFS_PQUOTA_ACCT; + if (uflags & XFS_QUOTA_GDQ_ACCT) + flags |= XFS_GQUOTA_ACCT; + if (uflags & XFS_QUOTA_UDQ_ENFD) + flags |= XFS_UQUOTA_ENFD; + if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) + flags |= XFS_OQUOTA_ENFD; + + switch (op) { + case Q_XQUOTAON: + return -xfs_qm_scall_quotaon(mp, flags); + case Q_XQUOTAOFF: + if (!XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + return -xfs_qm_scall_quotaoff(mp, flags); + case Q_XQUOTARM: + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + return -xfs_qm_scall_trunc_qfiles(mp, flags); + } + + return -EINVAL; +} + +STATIC int +xfs_fs_get_xquota( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq); +} + +STATIC int +xfs_fs_set_xquota( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); +} + +struct quotactl_ops xfs_quotactl_operations = { + .quota_sync = xfs_fs_quota_sync, + .get_xstate = xfs_fs_get_xstate, + .set_xstate = xfs_fs_set_xstate, + .get_xquota = xfs_fs_get_xquota, + .set_xquota = xfs_fs_set_xquota, +}; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 712fa2950875..bc1e64708e2b 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -68,7 +68,6 @@ #include #include -static struct quotactl_ops xfs_quotactl_operations; static struct super_operations xfs_super_operations; static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; @@ -1333,57 +1332,6 @@ xfs_fs_show_options( return -xfs_showargs(XFS_M(mnt->mnt_sb), m); } -STATIC int -xfs_fs_quotasync( - struct super_block *sb, - int type) -{ - return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XQUOTASYNC, 0, NULL); -} - -STATIC int -xfs_fs_getxstate( - struct super_block *sb, - struct fs_quota_stat *fqs) -{ - return -XFS_QM_QUOTACTL(XFS_M(sb), Q_XGETQSTAT, 0, (caddr_t)fqs); -} - -STATIC int -xfs_fs_setxstate( - struct super_block *sb, - unsigned int flags, - int op) -{ - return -XFS_QM_QUOTACTL(XFS_M(sb), op, 0, (caddr_t)&flags); -} - -STATIC int -xfs_fs_getxquota( - struct super_block *sb, - int type, - qid_t id, - struct fs_disk_quota *fdq) -{ - return -XFS_QM_QUOTACTL(XFS_M(sb), - (type == USRQUOTA) ? Q_XGETQUOTA : - ((type == GRPQUOTA) ? Q_XGETGQUOTA : - Q_XGETPQUOTA), id, (caddr_t)fdq); -} - -STATIC int -xfs_fs_setxquota( - struct super_block *sb, - int type, - qid_t id, - struct fs_disk_quota *fdq) -{ - return -XFS_QM_QUOTACTL(XFS_M(sb), - (type == USRQUOTA) ? Q_XSETQLIM : - ((type == GRPQUOTA) ? Q_XSETGQLIM : - Q_XSETPQLIM), id, (caddr_t)fdq); -} - /* * This function fills in xfs_mount_t fields based on mount args. * Note: the superblock _has_ now been read in. @@ -1466,7 +1414,9 @@ xfs_fs_fill_super( sb_min_blocksize(sb, BBSIZE); sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; +#ifdef CONFIG_XFS_QUOTA sb->s_qcop = &xfs_quotactl_operations; +#endif sb->s_op = &xfs_super_operations; error = xfs_dmops_get(mp); @@ -1609,14 +1559,6 @@ static struct super_operations xfs_super_operations = { .show_options = xfs_fs_show_options, }; -static struct quotactl_ops xfs_quotactl_operations = { - .quota_sync = xfs_fs_quotasync, - .get_xstate = xfs_fs_getxstate, - .set_xstate = xfs_fs_setxstate, - .get_xquota = xfs_fs_getxquota, - .set_xquota = xfs_fs_setxquota, -}; - static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index d5d776d4cd67..5a2ea3a21781 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -93,6 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; extern struct xattr_handler *xfs_xattr_handlers[]; +extern struct quotactl_ops xfs_quotactl_operations; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 5f6de1efe1f6..04f058c848ae 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -19,6 +19,7 @@ #define XFS_SYNC_H 1 struct xfs_mount; +struct xfs_perag; typedef struct bhv_vfs_sync_work { struct list_head w_list; diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 5ef08fb7562f..933df4204fc7 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -173,6 +173,16 @@ extern void xfs_qm_dqdetach(xfs_inode_t *); extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); +/* quota ops */ +extern int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); +extern int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, + fs_disk_quota_t *); +extern int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, + fs_disk_quota_t *); +extern int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); +extern int xfs_qm_scall_quotaon(xfs_mount_t *, uint); +extern int xfs_qm_scall_quotaoff(xfs_mount_t *, uint); + /* vop stuff */ extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *, uid_t, gid_t, prid_t, uint, @@ -190,10 +200,6 @@ extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *, extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); extern void xfs_qm_freelist_unlink(xfs_dquot_t *); -/* system call interface */ -extern int xfs_qm_quotactl(struct xfs_mount *, int, int, - xfs_caddr_t); - #ifdef DEBUG extern int xfs_qm_internalqcheck(xfs_mount_t *); #else diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index bc6c5cca3e12..63037c689a4b 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -235,7 +235,6 @@ struct xfs_qmops xfs_qmcore_xfs = { .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve, .xfs_dqstatvfs = xfs_qm_statvfs, .xfs_dqsync = xfs_qm_sync, - .xfs_quotactl = xfs_qm_quotactl, .xfs_dqtrxops = &xfs_trans_dquot_ops, }; EXPORT_SYMBOL(xfs_qmcore_xfs); diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 68139b38aede..b00c8d484aa9 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -57,134 +57,15 @@ # define qdprintk(s, args...) do { } while (0) #endif -STATIC int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); -STATIC int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); -STATIC int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); -STATIC int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); -STATIC int xfs_qm_scall_quotaon(xfs_mount_t *, uint); -STATIC int xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t); STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, uint); -STATIC uint xfs_qm_import_flags(uint); STATIC uint xfs_qm_export_flags(uint); -STATIC uint xfs_qm_import_qtype_flags(uint); STATIC uint xfs_qm_export_qtype_flags(uint); STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, fs_disk_quota_t *); -/* - * The main distribution switch of all XFS quotactl system calls. - */ -int -xfs_qm_quotactl( - xfs_mount_t *mp, - int cmd, - int id, - xfs_caddr_t addr) -{ - int error; - - ASSERT(addr != NULL || cmd == Q_XQUOTASYNC); - - /* - * The following commands are valid even when quotaoff. - */ - switch (cmd) { - case Q_XQUOTARM: - /* - * Truncate quota files. quota must be off. - */ - if (XFS_IS_QUOTA_ON(mp)) - return XFS_ERROR(EINVAL); - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - return (xfs_qm_scall_trunc_qfiles(mp, - xfs_qm_import_qtype_flags(*(uint *)addr))); - - case Q_XGETQSTAT: - /* - * Get quota status information. - */ - return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr)); - - case Q_XQUOTAON: - /* - * QUOTAON - enabling quota enforcement. - * Quota accounting must be turned on at mount time. - */ - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - return (xfs_qm_scall_quotaon(mp, - xfs_qm_import_flags(*(uint *)addr))); - - case Q_XQUOTAOFF: - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - break; - - case Q_XQUOTASYNC: - return xfs_sync_inodes(mp, SYNC_DELWRI); - - default: - break; - } - - if (! XFS_IS_QUOTA_ON(mp)) - return XFS_ERROR(ESRCH); - - switch (cmd) { - case Q_XQUOTAOFF: - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_quotaoff(mp, - xfs_qm_import_flags(*(uint *)addr), - B_FALSE); - break; - - case Q_XGETQUOTA: - error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER, - (fs_disk_quota_t *)addr); - break; - case Q_XGETGQUOTA: - error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, - (fs_disk_quota_t *)addr); - break; - case Q_XGETPQUOTA: - error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ, - (fs_disk_quota_t *)addr); - break; - - case Q_XSETQLIM: - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER, - (fs_disk_quota_t *)addr); - break; - case Q_XSETGQLIM: - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, - (fs_disk_quota_t *)addr); - break; - case Q_XSETPQLIM: - if (mp->m_flags & XFS_MOUNT_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ, - (fs_disk_quota_t *)addr); - break; - - default: - error = XFS_ERROR(EINVAL); - break; - } - - return (error); -} - /* * Turn off quota accounting and/or enforcement for all udquots and/or * gdquots. Called only at unmount time. @@ -193,11 +74,10 @@ xfs_qm_quotactl( * incore, and modifies the ondisk dquot directly. Therefore, for example, * it is an error to call this twice, without purging the cache. */ -STATIC int +int xfs_qm_scall_quotaoff( xfs_mount_t *mp, - uint flags, - boolean_t force) + uint flags) { uint dqtype; int error; @@ -205,8 +85,6 @@ xfs_qm_scall_quotaoff( xfs_qoff_logitem_t *qoffstart; int nculprits; - if (!force && !capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); /* * No file system can have quotas enabled on disk but not in core. * Note that quota utilities (like quotaoff) _expect_ @@ -375,7 +253,7 @@ out_error: return (error); } -STATIC int +int xfs_qm_scall_trunc_qfiles( xfs_mount_t *mp, uint flags) @@ -383,8 +261,6 @@ xfs_qm_scall_trunc_qfiles( int error = 0, error2 = 0; xfs_inode_t *qip; - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); return XFS_ERROR(EINVAL); @@ -416,7 +292,7 @@ xfs_qm_scall_trunc_qfiles( * effect immediately. * (Switching on quota accounting must be done at mount time.) */ -STATIC int +int xfs_qm_scall_quotaon( xfs_mount_t *mp, uint flags) @@ -426,9 +302,6 @@ xfs_qm_scall_quotaon( uint accflags; __int64_t sbflags; - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); - flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); /* * Switching on quota accounting must be done at mount time. @@ -517,7 +390,7 @@ xfs_qm_scall_quotaon( /* * Return quota status information, such as uquota-off, enforcements, etc. */ -STATIC int +int xfs_qm_scall_getqstat( xfs_mount_t *mp, fs_quota_stat_t *out) @@ -582,7 +455,7 @@ xfs_qm_scall_getqstat( /* * Adjust quota limits, and start/stop timers accordingly. */ -STATIC int +int xfs_qm_scall_setqlim( xfs_mount_t *mp, xfs_dqid_t id, @@ -595,9 +468,6 @@ xfs_qm_scall_setqlim( int error; xfs_qcnt_t hard, soft; - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); - if ((newlim->d_fieldmask & (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) return (0); @@ -742,7 +612,7 @@ xfs_qm_scall_setqlim( return error; } -STATIC int +int xfs_qm_scall_getquota( xfs_mount_t *mp, xfs_dqid_t id, @@ -934,30 +804,6 @@ xfs_qm_export_dquot( #endif } -STATIC uint -xfs_qm_import_qtype_flags( - uint uflags) -{ - uint oflags = 0; - - /* - * Can't be more than one, or none. - */ - if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == - (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) || - ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) == - (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) || - ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) == - (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) || - ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0)) - return (0); - - oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0; - oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0; - oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0; - return oflags; -} - STATIC uint xfs_qm_export_qtype_flags( uint flags) @@ -978,26 +824,6 @@ xfs_qm_export_qtype_flags( XFS_PROJ_QUOTA : XFS_GROUP_QUOTA; } -STATIC uint -xfs_qm_import_flags( - uint uflags) -{ - uint flags = 0; - - if (uflags & XFS_QUOTA_UDQ_ACCT) - flags |= XFS_UQUOTA_ACCT; - if (uflags & XFS_QUOTA_PDQ_ACCT) - flags |= XFS_PQUOTA_ACCT; - if (uflags & XFS_QUOTA_GDQ_ACCT) - flags |= XFS_GQUOTA_ACCT; - if (uflags & XFS_QUOTA_UDQ_ENFD) - flags |= XFS_UQUOTA_ENFD; - if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) - flags |= XFS_OQUOTA_ENFD; - return (flags); -} - - STATIC uint xfs_qm_export_flags( uint flags) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index de9beed9dce1..1438dd4dc909 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -136,7 +136,6 @@ typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct xfs_dquot *, uint); typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *); typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); -typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t); typedef struct xfs_qmops { xfs_qminit_t xfs_qminit; @@ -154,7 +153,6 @@ typedef struct xfs_qmops { xfs_dqvopchownresv_t xfs_dqvopchownresv; xfs_dqstatvfs_t xfs_dqstatvfs; xfs_dqsync_t xfs_dqsync; - xfs_quotactl_t xfs_quotactl; struct xfs_dqtrxops *xfs_dqtrxops; } xfs_qmops_t; @@ -188,8 +186,6 @@ typedef struct xfs_qmops { (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp) #define XFS_QM_DQSYNC(mp, flags) \ (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags) -#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \ - (*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr) #ifdef HAVE_PERCPU_SB diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c index 27f80581520a..e101790ea8e7 100644 --- a/fs/xfs/xfs_qmops.c +++ b/fs/xfs/xfs_qmops.c @@ -126,7 +126,6 @@ static struct xfs_qmops xfs_qmcore_stub = { .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr, .xfs_dqstatvfs = (xfs_dqstatvfs_t) fs_noval, .xfs_dqsync = (xfs_dqsync_t) fs_noerr, - .xfs_quotactl = (xfs_quotactl_t) fs_nosys, }; int diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 63d41acf4533..f5d1202dde25 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -18,6 +18,8 @@ #ifndef __XFS_QUOTA_H__ #define __XFS_QUOTA_H__ +struct xfs_trans; + /* * The ondisk form of a dquot structure. */ -- cgit v1.2.3-59-g8ed1b From 2db270a80b8f2238e536876cfb3987af02684df8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Feb 2009 20:46:45 +0100 Subject: tracing/blktrace: move the tracing file to kernel/trace Impact: cleanup Move blktrace.c to kernel/trace, also move its config entry. Signed-off-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Acked-by: Jens Axboe Signed-off-by: Ingo Molnar --- block/Kconfig | 24 - block/Makefile | 1 - block/blktrace.c | 1538 ----------------------------------------------- kernel/trace/Kconfig | 23 + kernel/trace/Makefile | 1 + kernel/trace/blktrace.c | 1538 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1562 insertions(+), 1563 deletions(-) delete mode 100644 block/blktrace.c create mode 100644 kernel/trace/blktrace.c diff --git a/block/Kconfig b/block/Kconfig index 7cdaa1d72252..e7d12782bcfb 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -44,30 +44,6 @@ config LBD If unsure, say N. -config BLK_DEV_IO_TRACE - bool "Support for tracing block io actions" - depends on SYSFS - select RELAY - select DEBUG_FS - select TRACEPOINTS - select TRACING - select STACKTRACE - help - Say Y here if you want to be able to trace the block layer actions - on a given queue. Tracing allows you to see any traffic happening - on a block device queue. For more information (and the userspace - support tools needed), fetch the blktrace tools from: - - git://git.kernel.dk/blktrace.git - - Tracing also is possible using the ftrace interface, e.g.: - - echo 1 > /sys/block/sda/sda1/trace/enable - echo blk > /sys/kernel/debug/tracing/current_tracer - cat /sys/kernel/debug/tracing/trace_pipe - - If unsure, say N. - config BLK_DEV_BSG bool "Block layer SG support v4 (EXPERIMENTAL)" depends on EXPERIMENTAL diff --git a/block/Makefile b/block/Makefile index bfe73049f939..e9fa4dd690f2 100644 --- a/block/Makefile +++ b/block/Makefile @@ -13,6 +13,5 @@ obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/blktrace.c b/block/blktrace.c deleted file mode 100644 index ca6d32061e4f..000000000000 --- a/block/blktrace.c +++ /dev/null @@ -1,1538 +0,0 @@ -/* - * Copyright (C) 2006 Jens Axboe - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include <../kernel/trace/trace_output.h> - -static unsigned int blktrace_seq __read_mostly = 1; - -static struct trace_array *blk_tr; -static int __read_mostly blk_tracer_enabled; - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_BLK_OPT_CLASSIC 0x1 - -static struct tracer_opt blk_tracer_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, - { } -}; - -static struct tracer_flags blk_tracer_flags = { - .val = 0, - .opts = blk_tracer_opts, -}; - -/* Global reference count of probes */ -static DEFINE_MUTEX(blk_probe_mutex); -static atomic_t blk_probes_ref = ATOMIC_INIT(0); - -static int blk_register_tracepoints(void); -static void blk_unregister_tracepoints(void); - -/* - * Send out a notify message. - */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) -{ - struct blk_io_trace *t; - - if (!bt->rchan) - return; - - t = relay_reserve(bt->rchan, sizeof(*t) + len); - if (t) { - const int cpu = smp_processor_id(); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); - t->device = bt->dev; - t->action = action; - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); - } -} - -/* - * Send out a notify for this process, if we haven't done so since a trace - * started - */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) -{ - tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); -} - -static void trace_note_time(struct blk_trace *bt) -{ - struct timespec now; - unsigned long flags; - u32 words[2]; - - getnstimeofday(&now); - words[0] = now.tv_sec; - words[1] = now.tv_nsec; - - local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); - local_irq_restore(flags); -} - -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) -{ - int n; - va_list args; - unsigned long flags; - char *buf; - - if (blk_tr) { - va_start(args, fmt); - ftrace_vprintk(fmt, args); - va_end(args); - return; - } - - if (!bt->msg_data) - return; - - local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); - va_start(args, fmt); - n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); - va_end(args); - - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(__trace_note_message); - -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, - pid_t pid) -{ - if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) - return 1; - if (sector < bt->start_lba || sector > bt->end_lba) - return 1; - if (bt->pid && pid != bt->pid) - return 1; - - return 0; -} - -/* - * Data direction bit lookup - */ -static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), - BLK_TC_ACT(BLK_TC_WRITE) }; - -/* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) - -/* - * The worker for the various blk_add_trace*() types. Fills out a - * blk_io_trace structure and places it in a per-cpu subbuffer. - */ -static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int rw, u32 what, int error, int pdu_len, void *pdu_data) -{ - struct task_struct *tsk = current; - struct ring_buffer_event *event = NULL; - struct blk_io_trace *t; - unsigned long flags = 0; - unsigned long *sequence; - pid_t pid; - int cpu, pc = 0; - - if (unlikely(bt->trace_state != Blktrace_running || - !blk_tracer_enabled)) - return; - - what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNC); - what |= MASK_TC_BIT(rw, AHEAD); - what |= MASK_TC_BIT(rw, META); - what |= MASK_TC_BIT(rw, DISCARD); - - pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) - return; - cpu = raw_smp_processor_id(); - - if (blk_tr) { - tracing_record_cmdline(current); - - pc = preempt_count(); - event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, - sizeof(*t) + pdu_len, - 0, pc); - if (!event) - return; - t = ring_buffer_event_data(event); - goto record_it; - } - - /* - * A word about the locking here - we disable interrupts to reserve - * some space in the relay per-cpu buffer, to prevent an irq - * from coming in and stepping on our toes. - */ - local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); - if (t) { - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); -record_it: - /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. - */ - t->cpu = cpu; - t->pid = pid; - - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->device = bt->dev; - t->error = error; - t->pdu_len = pdu_len; - - if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); - - if (blk_tr) { - trace_buffer_unlock_commit(blk_tr, event, 0, pc); - return; - } - } - - local_irq_restore(flags); -} - -static struct dentry *blk_tree_root; -static DEFINE_MUTEX(blk_tree_mutex); - -static void blk_trace_cleanup(struct blk_trace *bt) -{ - debugfs_remove(bt->msg_file); - debugfs_remove(bt->dropped_file); - relay_close(bt->rchan); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - kfree(bt); - mutex_lock(&blk_probe_mutex); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); -} - -int blk_trace_remove(struct request_queue *q) -{ - struct blk_trace *bt; - - bt = xchg(&q->blk_trace, NULL); - if (!bt) - return -EINVAL; - - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) - blk_trace_cleanup(bt); - - return 0; -} -EXPORT_SYMBOL_GPL(blk_trace_remove); - -static int blk_dropped_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - -static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct blk_trace *bt = filp->private_data; - char buf[16]; - - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); - - return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); -} - -static const struct file_operations blk_dropped_fops = { - .owner = THIS_MODULE, - .open = blk_dropped_open, - .read = blk_dropped_read, -}; - -static int blk_msg_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - -static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *msg; - struct blk_trace *bt; - - if (count > BLK_TN_MAX_MSG) - return -EINVAL; - - msg = kmalloc(count, GFP_KERNEL); - if (msg == NULL) - return -ENOMEM; - - if (copy_from_user(msg, buffer, count)) { - kfree(msg); - return -EFAULT; - } - - bt = filp->private_data; - __trace_note_message(bt, "%s", msg); - kfree(msg); - - return count; -} - -static const struct file_operations blk_msg_fops = { - .owner = THIS_MODULE, - .open = blk_msg_open, - .write = blk_msg_write, -}; - -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. - */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - -static int blk_remove_buf_file_callback(struct dentry *dentry) -{ - struct dentry *parent = dentry->d_parent; - debugfs_remove(dentry); - - /* - * this will fail for all but the last file, but that is ok. what we - * care about is the top level buts->name directory going away, when - * the last trace file is gone. Then we don't have to rmdir() that - * manually on trace stop, so it nicely solves the issue with - * force killing of running traces. - */ - - debugfs_remove(parent); - return 0; -} - -static struct dentry *blk_create_buf_file_callback(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, - .create_buf_file = blk_create_buf_file_callback, - .remove_buf_file = blk_remove_buf_file_callback, -}; - -/* - * Setup everything required to start tracing - */ -int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) -{ - struct blk_trace *old_bt, *bt = NULL; - struct dentry *dir = NULL; - int ret, i; - - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); - buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - for (i = 0; i < strlen(buts->name); i++) - if (buts->name[i] == '/') - buts->name[i] = '_'; - - ret = -ENOMEM; - bt = kzalloc(sizeof(*bt), GFP_KERNEL); - if (!bt) - goto err; - - bt->sequence = alloc_percpu(unsigned long); - if (!bt->sequence) - goto err; - - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); - if (!bt->msg_data) - goto err; - - ret = -ENOENT; - - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) - return -ENOMEM; - } - - dir = debugfs_create_dir(buts->name, blk_tree_root); - - if (!dir) - goto err; - - bt->dir = dir; - bt->dev = dev; - atomic_set(&bt->dropped, 0); - - ret = -EIO; - bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, - &blk_dropped_fops); - if (!bt->dropped_file) - goto err; - - bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - if (!bt->msg_file) - goto err; - - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); - if (!bt->rchan) - goto err; - - bt->act_mask = buts->act_mask; - if (!bt->act_mask) - bt->act_mask = (u16) -1; - - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) - bt->end_lba = -1ULL; - - bt->pid = buts->pid; - bt->trace_state = Blktrace_setup; - - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) { - ret = blk_register_tracepoints(); - if (ret) - goto probe_err; - } - mutex_unlock(&blk_probe_mutex); - - ret = -EBUSY; - old_bt = xchg(&q->blk_trace, bt); - if (old_bt) { - (void) xchg(&q->blk_trace, old_bt); - goto err; - } - - return 0; -probe_err: - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); -err: - if (bt) { - if (bt->msg_file) - debugfs_remove(bt->msg_file); - if (bt->dropped_file) - debugfs_remove(bt->dropped_file); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - if (bt->rchan) - relay_close(bt->rchan); - kfree(bt); - } - return ret; -} - -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - char __user *arg) -{ - struct blk_user_trace_setup buts; - int ret; - - ret = copy_from_user(&buts, arg, sizeof(buts)); - if (ret) - return -EFAULT; - - ret = do_blk_trace_setup(q, name, dev, &buts); - if (ret) - return ret; - - if (copy_to_user(arg, &buts, sizeof(buts))) - return -EFAULT; - - return 0; -} -EXPORT_SYMBOL_GPL(blk_trace_setup); - -int blk_trace_startstop(struct request_queue *q, int start) -{ - int ret; - struct blk_trace *bt = q->blk_trace; - - if (bt == NULL) - return -EINVAL; - - /* - * For starting a trace, we can transition from a setup or stopped - * trace. For stopping a trace, the state must be running - */ - ret = -EINVAL; - if (start) { - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) { - blktrace_seq++; - smp_mb(); - bt->trace_state = Blktrace_running; - - trace_note_time(bt); - ret = 0; - } - } else { - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - relay_flush(bt->rchan); - ret = 0; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(blk_trace_startstop); - -/** - * blk_trace_ioctl: - handle the ioctls associated with tracing - * @bdev: the block device - * @cmd: the ioctl cmd - * @arg: the argument data, if any - * - **/ -int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) -{ - struct request_queue *q; - int ret, start = 0; - char b[BDEVNAME_SIZE]; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - mutex_lock(&bdev->bd_mutex); - - switch (cmd) { - case BLKTRACESETUP: - bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); - break; - case BLKTRACESTART: - start = 1; - case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); - break; - case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); - break; - default: - ret = -ENOTTY; - break; - } - - mutex_unlock(&bdev->bd_mutex); - return ret; -} - -/** - * blk_trace_shutdown: - stop and cleanup trace structures - * @q: the request queue associated with the device - * - **/ -void blk_trace_shutdown(struct request_queue *q) -{ - if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); - } -} - -/* - * blktrace probes - */ - -/** - * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for - * @rq: the source request - * @what: the action - * - * Description: - * Records an action against a request. Will log the bio offset + size. - * - **/ -static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; - - if (likely(!bt)) - return; - - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); - - if (blk_pc_request(rq)) { - what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, - sizeof(rq->cmd), rq->cmd); - } else { - what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - rw, what, rq->errors, 0, NULL); - } -} - -static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_ABORT); -} - -static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_INSERT); -} - -static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); -} - -static void blk_add_trace_rq_requeue(struct request_queue *q, - struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); -} - -static void blk_add_trace_rq_complete(struct request_queue *q, - struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); -} - -/** - * blk_add_trace_bio - Add a trace for a bio oriented action - * @q: queue the io is for - * @bio: the source bio - * @what: the action - * - * Description: - * Records an action against a bio. Will log the bio offset + size. - * - **/ -static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - !bio_flagged(bio, BIO_UPTODATE), 0, NULL); -} - -static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); -} - -static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); -} - -static void blk_add_trace_bio_backmerge(struct request_queue *q, - struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); -} - -static void blk_add_trace_bio_frontmerge(struct request_queue *q, - struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); -} - -static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_QUEUE); -} - -static void blk_add_trace_getrq(struct request_queue *q, - struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ); - else { - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); - } -} - - -static void blk_add_trace_sleeprq(struct request_queue *q, - struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); - else { - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, - 0, 0, NULL); - } -} - -static void blk_add_trace_plug(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); -} - -static void blk_add_trace_unplug_io(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, - sizeof(rpdu), &rpdu); - } -} - -static void blk_add_trace_unplug_timer(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, - sizeof(rpdu), &rpdu); - } -} - -static void blk_add_trace_split(struct request_queue *q, struct bio *bio, - unsigned int pdu) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), - sizeof(rpdu), &rpdu); - } -} - -/** - * blk_add_trace_remap - Add a trace for a remap operation - * @q: queue the io is for - * @bio: the source bio - * @dev: target device - * @from: source sector - * @to: target sector - * - * Description: - * Device mapper or raid target sometimes need to split a bio because - * it spans a stripe (or similar). Add a trace for that action. - * - **/ -static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) -{ - struct blk_trace *bt = q->blk_trace; - struct blk_io_trace_remap r; - - if (likely(!bt)) - return; - - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); - - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); -} - -/** - * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for - * @rq: io request - * @data: driver-specific data - * @len: length of driver-specific data - * - * Description: - * Some drivers might want to write driver-specific data per request. - * - **/ -void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (blk_pc_request(rq)) - __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, - rq->errors, len, data); - else - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - 0, BLK_TA_DRV_DATA, rq->errors, len, data); -} -EXPORT_SYMBOL_GPL(blk_add_driver_data); - -static int blk_register_tracepoints(void) -{ - int ret; - - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); - WARN_ON(ret); - ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); - WARN_ON(ret); - ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); - WARN_ON(ret); - ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); - WARN_ON(ret); - ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); - WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); - WARN_ON(ret); - ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); - WARN_ON(ret); - ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - WARN_ON(ret); - ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - WARN_ON(ret); - ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); - WARN_ON(ret); - ret = register_trace_block_getrq(blk_add_trace_getrq); - WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); - WARN_ON(ret); - ret = register_trace_block_plug(blk_add_trace_plug); - WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); - WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); - WARN_ON(ret); - ret = register_trace_block_split(blk_add_trace_split); - WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap); - WARN_ON(ret); - return 0; -} - -static void blk_unregister_tracepoints(void) -{ - unregister_trace_block_remap(blk_add_trace_remap); - unregister_trace_block_split(blk_add_trace_split); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); - unregister_trace_block_plug(blk_add_trace_plug); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq); - unregister_trace_block_getrq(blk_add_trace_getrq); - unregister_trace_block_bio_queue(blk_add_trace_bio_queue); - unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - unregister_trace_block_bio_complete(blk_add_trace_bio_complete); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); - unregister_trace_block_rq_complete(blk_add_trace_rq_complete); - unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); - unregister_trace_block_rq_issue(blk_add_trace_rq_issue); - unregister_trace_block_rq_insert(blk_add_trace_rq_insert); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort); - - tracepoint_synchronize_unregister(); -} - -/* - * struct blk_io_tracer formatting routines - */ - -static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) -{ - int i = 0; - - if (t->action & BLK_TC_DISCARD) - rwbs[i++] = 'D'; - else if (t->action & BLK_TC_WRITE) - rwbs[i++] = 'W'; - else if (t->bytes) - rwbs[i++] = 'R'; - else - rwbs[i++] = 'N'; - - if (t->action & BLK_TC_AHEAD) - rwbs[i++] = 'A'; - if (t->action & BLK_TC_BARRIER) - rwbs[i++] = 'B'; - if (t->action & BLK_TC_SYNC) - rwbs[i++] = 'S'; - if (t->action & BLK_TC_META) - rwbs[i++] = 'M'; - - rwbs[i] = '\0'; -} - -static inline -const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) -{ - return (const struct blk_io_trace *)ent; -} - -static inline const void *pdu_start(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent) + 1; -} - -static inline u32 t_sec(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->bytes >> 9; -} - -static inline unsigned long long t_sector(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->sector; -} - -static inline __u16 t_error(const struct trace_entry *ent) -{ - return te_blk_io_trace(ent)->sector; -} - -static __u64 get_pdu_int(const struct trace_entry *ent) -{ - const __u64 *val = pdu_start(ent); - return be64_to_cpu(*val); -} - -static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) -{ - const struct blk_io_trace_remap *__r = pdu_start(ent); - __u64 sector = __r->sector; - - r->device = be32_to_cpu(__r->device); - r->device_from = be32_to_cpu(__r->device_from); - r->sector = be64_to_cpu(sector); -} - -static int blk_log_action_iter(struct trace_iterator *iter, const char *act) -{ - char rwbs[6]; - unsigned long long ts = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(ts, USEC_PER_SEC); - unsigned secs = (unsigned long)ts; - const struct trace_entry *ent = iter->ent; - const struct blk_io_trace *t = (const struct blk_io_trace *)ent; - - fill_rwbs(rwbs, t); - - return trace_seq_printf(&iter->seq, - "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", - MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, usec_rem, ent->pid, act, rwbs); -} - -static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, - const char *act) -{ - char rwbs[6]; - fill_rwbs(rwbs, t); - return trace_seq_printf(s, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); -} - -static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) -{ - const char *cmd = trace_find_cmdline(ent->pid); - - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", - t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); -} - -static int blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) -{ - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); -} - -static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) -{ - struct blk_io_trace_remap r = { .device = 0, }; - - get_pdu_remap(ent, &r); - return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), - t_sec(ent), MAJOR(r.device), MINOR(r.device), - (unsigned long long)r.sector); -} - -static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) -{ - return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); -} - -static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) -{ - return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), - get_pdu_int(ent)); -} - -static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) -{ - return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), trace_find_cmdline(ent->pid)); -} - -/* - * struct tracer operations - */ - -static void blk_tracer_print_header(struct seq_file *m) -{ - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) - return; - seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" - "# | | | | | |\n"); -} - -static void blk_tracer_start(struct trace_array *tr) -{ - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) - if (blk_register_tracepoints()) - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; -} - -static int blk_tracer_init(struct trace_array *tr) -{ - blk_tr = tr; - blk_tracer_start(tr); - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled++; - mutex_unlock(&blk_probe_mutex); - return 0; -} - -static void blk_tracer_stop(struct trace_array *tr) -{ - trace_flags |= TRACE_ITER_CONTEXT_INFO; - mutex_lock(&blk_probe_mutex); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); -} - -static void blk_tracer_reset(struct trace_array *tr) -{ - if (!atomic_read(&blk_probes_ref)) - return; - - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled--; - WARN_ON(blk_tracer_enabled < 0); - mutex_unlock(&blk_probe_mutex); - - blk_tracer_stop(tr); -} - -static struct { - const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); -} what2act[] __read_mostly = { - [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, - [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, - [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, - [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, - [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, - [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, - [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, - [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, - [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, - [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, - [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, - [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, - [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, - [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, - [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, -}; - -static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags) -{ - struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); - int ret; - - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; - - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Bad pc action %x\n", what); - else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(s, iter->ent); - } - - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const int offset = offsetof(struct blk_io_trace, sector); - struct blk_io_trace old = { - .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, - .time = ns2usecs(iter->ts), - }; - - if (!trace_seq_putmem(s, &old, offset)) - return 0; - return trace_seq_putmem(s, &t->sector, - sizeof(old) - offset + t->pdu_len); -} - -static enum print_line_t -blk_trace_event_print_binary(struct trace_iterator *iter, int flags) -{ - return blk_trace_synthesize_old_trace(iter) ? - TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) -{ - const struct blk_io_trace *t; - u16 what; - int ret; - - if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) - return TRACE_TYPE_UNHANDLED; - - t = (const struct blk_io_trace *)iter->ent; - what = t->action & ((1 << BLK_TC_SHIFT) - 1); - - if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) - ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); - else { - const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); - ret = blk_log_action_iter(iter, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(&iter->seq, iter->ent); - } - - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -} - -static struct tracer blk_tracer __read_mostly = { - .name = "blk", - .init = blk_tracer_init, - .reset = blk_tracer_reset, - .start = blk_tracer_start, - .stop = blk_tracer_stop, - .print_header = blk_tracer_print_header, - .print_line = blk_tracer_print_line, - .flags = &blk_tracer_flags, -}; - -static struct trace_event trace_blk_event = { - .type = TRACE_BLK, - .trace = blk_trace_event_print, - .latency_trace = blk_trace_event_print, - .binary = blk_trace_event_print_binary, -}; - -static int __init init_blk_tracer(void) -{ - if (!register_ftrace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); - return 1; - } - - if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register the block tracer\n"); - unregister_ftrace_event(&trace_blk_event); - return 1; - } - - return 0; -} - -device_initcall(init_blk_tracer); - -static int blk_trace_remove_queue(struct request_queue *q) -{ - struct blk_trace *bt; - - bt = xchg(&q->blk_trace, NULL); - if (bt == NULL) - return -EINVAL; - - kfree(bt); - return 0; -} - -/* - * Setup everything required to start tracing - */ -static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) -{ - struct blk_trace *old_bt, *bt = NULL; - int ret; - - ret = -ENOMEM; - bt = kzalloc(sizeof(*bt), GFP_KERNEL); - if (!bt) - goto err; - - bt->dev = dev; - bt->act_mask = (u16)-1; - bt->end_lba = -1ULL; - bt->trace_state = Blktrace_running; - - old_bt = xchg(&q->blk_trace, bt); - if (old_bt != NULL) { - (void)xchg(&q->blk_trace, old_bt); - kfree(bt); - ret = -EBUSY; - } - return 0; -err: - return ret; -} - -/* - * sysfs interface to enable and configure tracing - */ - -static ssize_t sysfs_blk_trace_enable_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct block_device *bdev; - ssize_t ret = -ENXIO; - - lock_kernel(); - bdev = bdget(part_devt(p)); - if (bdev != NULL) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q != NULL) { - mutex_lock(&bdev->bd_mutex); - ret = sprintf(buf, "%u\n", !!q->blk_trace); - mutex_unlock(&bdev->bd_mutex); - } - - bdput(bdev); - } - - unlock_kernel(); - return ret; -} - -static ssize_t sysfs_blk_trace_enable_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - int value; - ssize_t ret = -ENXIO; - - if (count == 0 || sscanf(buf, "%d", &value) != 1) - goto out; - - lock_kernel(); - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); - else - ret = blk_trace_remove_queue(q); - mutex_unlock(&bdev->bd_mutex); - - if (ret == 0) - ret = count; -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); -out: - return ret; -} - -static ssize_t sysfs_blk_trace_attr_show(struct device *dev, - struct device_attribute *attr, - char *buf); -static ssize_t sysfs_blk_trace_attr_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count); -#define BLK_TRACE_DEVICE_ATTR(_name) \ - DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ - sysfs_blk_trace_attr_show, \ - sysfs_blk_trace_attr_store) - -static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, - sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); -static BLK_TRACE_DEVICE_ATTR(act_mask); -static BLK_TRACE_DEVICE_ATTR(pid); -static BLK_TRACE_DEVICE_ATTR(start_lba); -static BLK_TRACE_DEVICE_ATTR(end_lba); - -static struct attribute *blk_trace_attrs[] = { - &dev_attr_enable.attr, - &dev_attr_act_mask.attr, - &dev_attr_pid.attr, - &dev_attr_start_lba.attr, - &dev_attr_end_lba.attr, - NULL -}; - -struct attribute_group blk_trace_attr_group = { - .name = "trace", - .attrs = blk_trace_attrs, -}; - -static int blk_str2act_mask(const char *str) -{ - int mask = 0; - char *copy = kstrdup(str, GFP_KERNEL), *s; - - if (copy == NULL) - return -ENOMEM; - - s = strstrip(copy); - - while (1) { - char *sep = strchr(s, ','); - - if (sep != NULL) - *sep = '\0'; - - if (strcasecmp(s, "barrier") == 0) - mask |= BLK_TC_BARRIER; - else if (strcasecmp(s, "complete") == 0) - mask |= BLK_TC_COMPLETE; - else if (strcasecmp(s, "fs") == 0) - mask |= BLK_TC_FS; - else if (strcasecmp(s, "issue") == 0) - mask |= BLK_TC_ISSUE; - else if (strcasecmp(s, "pc") == 0) - mask |= BLK_TC_PC; - else if (strcasecmp(s, "queue") == 0) - mask |= BLK_TC_QUEUE; - else if (strcasecmp(s, "read") == 0) - mask |= BLK_TC_READ; - else if (strcasecmp(s, "requeue") == 0) - mask |= BLK_TC_REQUEUE; - else if (strcasecmp(s, "sync") == 0) - mask |= BLK_TC_SYNC; - else if (strcasecmp(s, "write") == 0) - mask |= BLK_TC_WRITE; - - if (sep == NULL) - break; - - s = sep + 1; - } - kfree(copy); - - return mask; -} - -static ssize_t sysfs_blk_trace_attr_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q; - struct block_device *bdev; - ssize_t ret = -ENXIO; - - lock_kernel(); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - mutex_lock(&bdev->bd_mutex); - if (q->blk_trace == NULL) - ret = sprintf(buf, "disabled\n"); - else if (attr == &dev_attr_act_mask) - ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); - else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); - else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); - else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); - mutex_unlock(&bdev->bd_mutex); -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); - return ret; -} - -static ssize_t sysfs_blk_trace_attr_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - u64 value; - ssize_t ret = -ENXIO; - - if (count == 0) - goto out; - - if (attr == &dev_attr_act_mask) { - if (sscanf(buf, "%llx", &value) != 1) { - /* Assume it is a list of trace category names */ - value = blk_str2act_mask(buf); - if (value < 0) - goto out; - } - } else if (sscanf(buf, "%llu", &value) != 1) - goto out; - - lock_kernel(); - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - ret = 0; - if (q->blk_trace == NULL) - ret = blk_trace_setup_queue(q, bdev->bd_dev); - - if (ret == 0) { - if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; - else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; - else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; - else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; - ret = count; - } - mutex_unlock(&bdev->bd_mutex); -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); -out: - return ret; -} diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 25131a5d5e4f..4fee43c01942 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -302,6 +302,29 @@ config WORKQUEUE_TRACER For example it can help a developer to decide whether he should choose a per cpu workqueue instead of a singlethreaded one. +config BLK_DEV_IO_TRACE + bool "Support for tracing block io actions" + depends on SYSFS + select RELAY + select DEBUG_FS + select TRACEPOINTS + select TRACING + select STACKTRACE + help + Say Y here if you want to be able to trace the block layer actions + on a given queue. Tracing allows you to see any traffic happening + on a block device queue. For more information (and the userspace + support tools needed), fetch the blktrace tools from: + + git://git.kernel.dk/blktrace.git + + Tracing also is possible using the ftrace interface, e.g.: + + echo 1 > /sys/block/sda/sda1/trace/enable + echo blk > /sys/kernel/debug/tracing/current_tracer + cat /sys/kernel/debug/tracing/trace_pipe + + If unsure, say N. config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f76d48f3527d..627090bc262d 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -37,5 +37,6 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c new file mode 100644 index 000000000000..3b91da064820 --- /dev/null +++ b/kernel/trace/blktrace.c @@ -0,0 +1,1538 @@ +/* + * Copyright (C) 2006 Jens Axboe + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace_output.h" + +static unsigned int blktrace_seq __read_mostly = 1; + +static struct trace_array *blk_tr; +static int __read_mostly blk_tracer_enabled; + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_BLK_OPT_CLASSIC 0x1 + +static struct tracer_opt blk_tracer_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, + { } +}; + +static struct tracer_flags blk_tracer_flags = { + .val = 0, + .opts = blk_tracer_opts, +}; + +/* Global reference count of probes */ +static DEFINE_MUTEX(blk_probe_mutex); +static atomic_t blk_probes_ref = ATOMIC_INIT(0); + +static int blk_register_tracepoints(void); +static void blk_unregister_tracepoints(void); + +/* + * Send out a notify message. + */ +static void trace_note(struct blk_trace *bt, pid_t pid, int action, + const void *data, size_t len) +{ + struct blk_io_trace *t; + + if (!bt->rchan) + return; + + t = relay_reserve(bt->rchan, sizeof(*t) + len); + if (t) { + const int cpu = smp_processor_id(); + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->time = ktime_to_ns(ktime_get()); + t->device = bt->dev; + t->action = action; + t->pid = pid; + t->cpu = cpu; + t->pdu_len = len; + memcpy((void *) t + sizeof(*t), data, len); + } +} + +/* + * Send out a notify for this process, if we haven't done so since a trace + * started + */ +static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +{ + tsk->btrace_seq = blktrace_seq; + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); +} + +static void trace_note_time(struct blk_trace *bt) +{ + struct timespec now; + unsigned long flags; + u32 words[2]; + + getnstimeofday(&now); + words[0] = now.tv_sec; + words[1] = now.tv_nsec; + + local_irq_save(flags); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + local_irq_restore(flags); +} + +void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +{ + int n; + va_list args; + unsigned long flags; + char *buf; + + if (blk_tr) { + va_start(args, fmt); + ftrace_vprintk(fmt, args); + va_end(args); + return; + } + + if (!bt->msg_data) + return; + + local_irq_save(flags); + buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); + va_start(args, fmt); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(__trace_note_message); + +static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, + pid_t pid) +{ + if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) + return 1; + if (sector < bt->start_lba || sector > bt->end_lba) + return 1; + if (bt->pid && pid != bt->pid) + return 1; + + return 0; +} + +/* + * Data direction bit lookup + */ +static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; + +/* The ilog2() calls fall out because they're constant */ +#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) + +/* + * The worker for the various blk_add_trace*() types. Fills out a + * blk_io_trace structure and places it in a per-cpu subbuffer. + */ +static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, + int rw, u32 what, int error, int pdu_len, void *pdu_data) +{ + struct task_struct *tsk = current; + struct ring_buffer_event *event = NULL; + struct blk_io_trace *t; + unsigned long flags = 0; + unsigned long *sequence; + pid_t pid; + int cpu, pc = 0; + + if (unlikely(bt->trace_state != Blktrace_running || + !blk_tracer_enabled)) + return; + + what |= ddir_act[rw & WRITE]; + what |= MASK_TC_BIT(rw, BARRIER); + what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, AHEAD); + what |= MASK_TC_BIT(rw, META); + what |= MASK_TC_BIT(rw, DISCARD); + + pid = tsk->pid; + if (unlikely(act_log_check(bt, what, sector, pid))) + return; + cpu = raw_smp_processor_id(); + + if (blk_tr) { + tracing_record_cmdline(current); + + pc = preempt_count(); + event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, + sizeof(*t) + pdu_len, + 0, pc); + if (!event) + return; + t = ring_buffer_event_data(event); + goto record_it; + } + + /* + * A word about the locking here - we disable interrupts to reserve + * some space in the relay per-cpu buffer, to prevent an irq + * from coming in and stepping on our toes. + */ + local_irq_save(flags); + + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(bt, tsk); + + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + if (t) { + sequence = per_cpu_ptr(bt->sequence, cpu); + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = ++(*sequence); + t->time = ktime_to_ns(ktime_get()); +record_it: + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = what; + t->device = bt->dev; + t->error = error; + t->pdu_len = pdu_len; + + if (pdu_len) + memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + + if (blk_tr) { + trace_buffer_unlock_commit(blk_tr, event, 0, pc); + return; + } + } + + local_irq_restore(flags); +} + +static struct dentry *blk_tree_root; +static DEFINE_MUTEX(blk_tree_mutex); + +static void blk_trace_cleanup(struct blk_trace *bt) +{ + debugfs_remove(bt->msg_file); + debugfs_remove(bt->dropped_file); + relay_close(bt->rchan); + free_percpu(bt->sequence); + free_percpu(bt->msg_data); + kfree(bt); + mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +int blk_trace_remove(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (!bt) + return -EINVAL; + + if (bt->trace_state == Blktrace_setup || + bt->trace_state == Blktrace_stopped) + blk_trace_cleanup(bt); + + return 0; +} +EXPORT_SYMBOL_GPL(blk_trace_remove); + +static int blk_dropped_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + + return 0; +} + +static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct blk_trace *bt = filp->private_data; + char buf[16]; + + snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + + return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); +} + +static const struct file_operations blk_dropped_fops = { + .owner = THIS_MODULE, + .open = blk_dropped_open, + .read = blk_dropped_read, +}; + +static int blk_msg_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + + return 0; +} + +static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *msg; + struct blk_trace *bt; + + if (count > BLK_TN_MAX_MSG) + return -EINVAL; + + msg = kmalloc(count, GFP_KERNEL); + if (msg == NULL) + return -ENOMEM; + + if (copy_from_user(msg, buffer, count)) { + kfree(msg); + return -EFAULT; + } + + bt = filp->private_data; + __trace_note_message(bt, "%s", msg); + kfree(msg); + + return count; +} + +static const struct file_operations blk_msg_fops = { + .owner = THIS_MODULE, + .open = blk_msg_open, + .write = blk_msg_write, +}; + +/* + * Keep track of how many times we encountered a full subbuffer, to aid + * the user space app in telling how many lost events there were. + */ +static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + struct blk_trace *bt; + + if (!relay_buf_full(buf)) + return 1; + + bt = buf->chan->private_data; + atomic_inc(&bt->dropped); + return 0; +} + +static int blk_remove_buf_file_callback(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + debugfs_remove(dentry); + + /* + * this will fail for all but the last file, but that is ok. what we + * care about is the top level buts->name directory going away, when + * the last trace file is gone. Then we don't have to rmdir() that + * manually on trace stop, so it nicely solves the issue with + * force killing of running traces. + */ + + debugfs_remove(parent); + return 0; +} + +static struct dentry *blk_create_buf_file_callback(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static struct rchan_callbacks blk_relay_callbacks = { + .subbuf_start = blk_subbuf_start_callback, + .create_buf_file = blk_create_buf_file_callback, + .remove_buf_file = blk_remove_buf_file_callback, +}; + +/* + * Setup everything required to start tracing + */ +int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct blk_user_trace_setup *buts) +{ + struct blk_trace *old_bt, *bt = NULL; + struct dentry *dir = NULL; + int ret, i; + + if (!buts->buf_size || !buts->buf_nr) + return -EINVAL; + + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); + buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + for (i = 0; i < strlen(buts->name); i++) + if (buts->name[i] == '/') + buts->name[i] = '_'; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->sequence = alloc_percpu(unsigned long); + if (!bt->sequence) + goto err; + + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + if (!bt->msg_data) + goto err; + + ret = -ENOENT; + + if (!blk_tree_root) { + blk_tree_root = debugfs_create_dir("block", NULL); + if (!blk_tree_root) + return -ENOMEM; + } + + dir = debugfs_create_dir(buts->name, blk_tree_root); + + if (!dir) + goto err; + + bt->dir = dir; + bt->dev = dev; + atomic_set(&bt->dropped, 0); + + ret = -EIO; + bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, + &blk_dropped_fops); + if (!bt->dropped_file) + goto err; + + bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + if (!bt->msg_file) + goto err; + + bt->rchan = relay_open("trace", dir, buts->buf_size, + buts->buf_nr, &blk_relay_callbacks, bt); + if (!bt->rchan) + goto err; + + bt->act_mask = buts->act_mask; + if (!bt->act_mask) + bt->act_mask = (u16) -1; + + bt->start_lba = buts->start_lba; + bt->end_lba = buts->end_lba; + if (!bt->end_lba) + bt->end_lba = -1ULL; + + bt->pid = buts->pid; + bt->trace_state = Blktrace_setup; + + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) { + ret = blk_register_tracepoints(); + if (ret) + goto probe_err; + } + mutex_unlock(&blk_probe_mutex); + + ret = -EBUSY; + old_bt = xchg(&q->blk_trace, bt); + if (old_bt) { + (void) xchg(&q->blk_trace, old_bt); + goto err; + } + + return 0; +probe_err: + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); +err: + if (bt) { + if (bt->msg_file) + debugfs_remove(bt->msg_file); + if (bt->dropped_file) + debugfs_remove(bt->dropped_file); + free_percpu(bt->sequence); + free_percpu(bt->msg_data); + if (bt->rchan) + relay_close(bt->rchan); + kfree(bt); + } + return ret; +} + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + int ret; + + ret = copy_from_user(&buts, arg, sizeof(buts)); + if (ret) + return -EFAULT; + + ret = do_blk_trace_setup(q, name, dev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts, sizeof(buts))) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL_GPL(blk_trace_setup); + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + struct blk_trace *bt = q->blk_trace; + + if (bt == NULL) + return -EINVAL; + + /* + * For starting a trace, we can transition from a setup or stopped + * trace. For stopping a trace, the state must be running + */ + ret = -EINVAL; + if (start) { + if (bt->trace_state == Blktrace_setup || + bt->trace_state == Blktrace_stopped) { + blktrace_seq++; + smp_mb(); + bt->trace_state = Blktrace_running; + + trace_note_time(bt); + ret = 0; + } + } else { + if (bt->trace_state == Blktrace_running) { + bt->trace_state = Blktrace_stopped; + relay_flush(bt->rchan); + ret = 0; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(blk_trace_startstop); + +/** + * blk_trace_ioctl: - handle the ioctls associated with tracing + * @bdev: the block device + * @cmd: the ioctl cmd + * @arg: the argument data, if any + * + **/ +int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) +{ + struct request_queue *q; + int ret, start = 0; + char b[BDEVNAME_SIZE]; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + mutex_lock(&bdev->bd_mutex); + + switch (cmd) { + case BLKTRACESETUP: + bdevname(bdev, b); + ret = blk_trace_setup(q, b, bdev->bd_dev, arg); + break; + case BLKTRACESTART: + start = 1; + case BLKTRACESTOP: + ret = blk_trace_startstop(q, start); + break; + case BLKTRACETEARDOWN: + ret = blk_trace_remove(q); + break; + default: + ret = -ENOTTY; + break; + } + + mutex_unlock(&bdev->bd_mutex); + return ret; +} + +/** + * blk_trace_shutdown: - stop and cleanup trace structures + * @q: the request queue associated with the device + * + **/ +void blk_trace_shutdown(struct request_queue *q) +{ + if (q->blk_trace) { + blk_trace_startstop(q, 0); + blk_trace_remove(q); + } +} + +/* + * blktrace probes + */ + +/** + * blk_add_trace_rq - Add a trace for a request oriented action + * @q: queue the io is for + * @rq: the source request + * @what: the action + * + * Description: + * Records an action against a request. Will log the bio offset + size. + * + **/ +static void blk_add_trace_rq(struct request_queue *q, struct request *rq, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + int rw = rq->cmd_flags & 0x03; + + if (likely(!bt)) + return; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + if (blk_pc_request(rq)) { + what |= BLK_TC_ACT(BLK_TC_PC); + __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, + sizeof(rq->cmd), rq->cmd); + } else { + what |= BLK_TC_ACT(BLK_TC_FS); + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + rw, what, rq->errors, 0, NULL); + } +} + +static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ABORT); +} + +static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_INSERT); +} + +static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ISSUE); +} + +static void blk_add_trace_rq_requeue(struct request_queue *q, + struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); +} + +static void blk_add_trace_rq_complete(struct request_queue *q, + struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); +} + +/** + * blk_add_trace_bio - Add a trace for a bio oriented action + * @q: queue the io is for + * @bio: the source bio + * @what: the action + * + * Description: + * Records an action against a bio. Will log the bio offset + size. + * + **/ +static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, + !bio_flagged(bio, BIO_UPTODATE), 0, NULL); +} + +static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); +} + +static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); +} + +static void blk_add_trace_bio_backmerge(struct request_queue *q, + struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); +} + +static void blk_add_trace_bio_frontmerge(struct request_queue *q, + struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); +} + +static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_QUEUE); +} + +static void blk_add_trace_getrq(struct request_queue *q, + struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_GETRQ); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + } +} + + +static void blk_add_trace_sleeprq(struct request_queue *q, + struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, + 0, 0, NULL); + } +} + +static void blk_add_trace_plug(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); +} + +static void blk_add_trace_unplug_io(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, + sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_unplug_timer(struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, + sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_split(struct request_queue *q, struct bio *bio, + unsigned int pdu) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), + sizeof(rpdu), &rpdu); + } +} + +/** + * blk_add_trace_remap - Add a trace for a remap operation + * @q: queue the io is for + * @bio: the source bio + * @dev: target device + * @from: source sector + * @to: target sector + * + * Description: + * Device mapper or raid target sometimes need to split a bio because + * it spans a stripe (or similar). Add a trace for that action. + * + **/ +static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from, sector_t to) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device = cpu_to_be32(dev); + r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector = cpu_to_be64(to); + + __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, + !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); +} + +/** + * blk_add_driver_data - Add binary message with driver-specific data + * @q: queue the io is for + * @rq: io request + * @data: driver-specific data + * @len: length of driver-specific data + * + * Description: + * Some drivers might want to write driver-specific data per request. + * + **/ +void blk_add_driver_data(struct request_queue *q, + struct request *rq, + void *data, size_t len) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (blk_pc_request(rq)) + __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, + rq->errors, len, data); + else + __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + 0, BLK_TA_DRV_DATA, rq->errors, len, data); +} +EXPORT_SYMBOL_GPL(blk_add_driver_data); + +static int blk_register_tracepoints(void) +{ + int ret; + + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); + WARN_ON(ret); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); + WARN_ON(ret); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); + WARN_ON(ret); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); + WARN_ON(ret); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); + WARN_ON(ret); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); + WARN_ON(ret); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); + WARN_ON(ret); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + WARN_ON(ret); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + WARN_ON(ret); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); + WARN_ON(ret); + ret = register_trace_block_getrq(blk_add_trace_getrq); + WARN_ON(ret); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); + WARN_ON(ret); + ret = register_trace_block_plug(blk_add_trace_plug); + WARN_ON(ret); + ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); + WARN_ON(ret); + ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); + WARN_ON(ret); + ret = register_trace_block_split(blk_add_trace_split); + WARN_ON(ret); + ret = register_trace_block_remap(blk_add_trace_remap); + WARN_ON(ret); + return 0; +} + +static void blk_unregister_tracepoints(void) +{ + unregister_trace_block_remap(blk_add_trace_remap); + unregister_trace_block_split(blk_add_trace_split); + unregister_trace_block_unplug_io(blk_add_trace_unplug_io); + unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); + unregister_trace_block_plug(blk_add_trace_plug); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq); + unregister_trace_block_getrq(blk_add_trace_getrq); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert); + unregister_trace_block_rq_abort(blk_add_trace_rq_abort); + + tracepoint_synchronize_unregister(); +} + +/* + * struct blk_io_tracer formatting routines + */ + +static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +{ + int i = 0; + + if (t->action & BLK_TC_DISCARD) + rwbs[i++] = 'D'; + else if (t->action & BLK_TC_WRITE) + rwbs[i++] = 'W'; + else if (t->bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (t->action & BLK_TC_AHEAD) + rwbs[i++] = 'A'; + if (t->action & BLK_TC_BARRIER) + rwbs[i++] = 'B'; + if (t->action & BLK_TC_SYNC) + rwbs[i++] = 'S'; + if (t->action & BLK_TC_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +static inline +const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +{ + return (const struct blk_io_trace *)ent; +} + +static inline const void *pdu_start(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent) + 1; +} + +static inline u32 t_sec(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes >> 9; +} + +static inline unsigned long long t_sector(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static inline __u16 t_error(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static __u64 get_pdu_int(const struct trace_entry *ent) +{ + const __u64 *val = pdu_start(ent); + return be64_to_cpu(*val); +} + +static void get_pdu_remap(const struct trace_entry *ent, + struct blk_io_trace_remap *r) +{ + const struct blk_io_trace_remap *__r = pdu_start(ent); + __u64 sector = __r->sector; + + r->device = be32_to_cpu(__r->device); + r->device_from = be32_to_cpu(__r->device_from); + r->sector = be64_to_cpu(sector); +} + +static int blk_log_action_iter(struct trace_iterator *iter, const char *act) +{ + char rwbs[6]; + unsigned long long ts = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(ts, USEC_PER_SEC); + unsigned secs = (unsigned long)ts; + const struct trace_entry *ent = iter->ent; + const struct blk_io_trace *t = (const struct blk_io_trace *)ent; + + fill_rwbs(rwbs, t); + + return trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%06lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, usec_rem, ent->pid, act, rwbs); +} + +static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, + const char *act) +{ + char rwbs[6]; + fill_rwbs(rwbs, t); + return trace_seq_printf(s, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} + +static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *cmd = trace_find_cmdline(ent->pid); + + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); +} + +static int blk_log_with_error(struct trace_seq *s, + const struct trace_entry *ent) +{ + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); +} + +static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +{ + struct blk_io_trace_remap r = { .device = 0, }; + + get_pdu_remap(ent, &r); + return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), + t_sec(ent), MAJOR(r.device), MINOR(r.device), + (unsigned long long)r.sector); +} + +static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); +} + +static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), + get_pdu_int(ent)); +} + +static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +{ + return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), trace_find_cmdline(ent->pid)); +} + +/* + * struct tracer operations + */ + +static void blk_tracer_print_header(struct seq_file *m) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return; + seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" + "# | | | | | |\n"); +} + +static void blk_tracer_start(struct trace_array *tr) +{ + mutex_lock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) + if (blk_register_tracepoints()) + atomic_dec(&blk_probes_ref); + mutex_unlock(&blk_probe_mutex); + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; +} + +static int blk_tracer_init(struct trace_array *tr) +{ + blk_tr = tr; + blk_tracer_start(tr); + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled++; + mutex_unlock(&blk_probe_mutex); + return 0; +} + +static void blk_tracer_stop(struct trace_array *tr) +{ + trace_flags |= TRACE_ITER_CONTEXT_INFO; + mutex_lock(&blk_probe_mutex); + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void blk_tracer_reset(struct trace_array *tr) +{ + if (!atomic_read(&blk_probes_ref)) + return; + + mutex_lock(&blk_probe_mutex); + blk_tracer_enabled--; + WARN_ON(blk_tracer_enabled < 0); + mutex_unlock(&blk_probe_mutex); + + blk_tracer_stop(tr); +} + +static struct { + const char *act[2]; + int (*print)(struct trace_seq *s, const struct trace_entry *ent); +} what2act[] __read_mostly = { + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, + [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, + [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, + [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, + [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, + [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, + [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, + [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, + [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, + [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, + [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, + [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, + [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, +}; + +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags) +{ + struct trace_seq *s = &iter->seq; + const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1); + int ret; + + if (!trace_print_context(iter)) + return TRACE_TYPE_PARTIAL_LINE; + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(s, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_seq(s, t, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(s, iter->ent); + } + + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace old = { + .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, + .time = ns2usecs(iter->ts), + }; + + if (!trace_seq_putmem(s, &old, offset)) + return 0; + return trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); +} + +static enum print_line_t +blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +{ + return blk_trace_synthesize_old_trace(iter) ? + TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) +{ + const struct blk_io_trace *t; + u16 what; + int ret; + + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return TRACE_TYPE_UNHANDLED; + + t = (const struct blk_io_trace *)iter->ent; + what = t->action & ((1 << BLK_TC_SHIFT) - 1); + + if (unlikely(what == 0 || what > ARRAY_SIZE(what2act))) + ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what); + else { + const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + ret = blk_log_action_iter(iter, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(&iter->seq, iter->ent); + } + + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} + +static struct tracer blk_tracer __read_mostly = { + .name = "blk", + .init = blk_tracer_init, + .reset = blk_tracer_reset, + .start = blk_tracer_start, + .stop = blk_tracer_stop, + .print_header = blk_tracer_print_header, + .print_line = blk_tracer_print_line, + .flags = &blk_tracer_flags, +}; + +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .trace = blk_trace_event_print, + .latency_trace = blk_trace_event_print, + .binary = blk_trace_event_print_binary, +}; + +static int __init init_blk_tracer(void) +{ + if (!register_ftrace_event(&trace_blk_event)) { + pr_warning("Warning: could not register block events\n"); + return 1; + } + + if (register_tracer(&blk_tracer) != 0) { + pr_warning("Warning: could not register the block tracer\n"); + unregister_ftrace_event(&trace_blk_event); + return 1; + } + + return 0; +} + +device_initcall(init_blk_tracer); + +static int blk_trace_remove_queue(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (bt == NULL) + return -EINVAL; + + kfree(bt); + return 0; +} + +/* + * Setup everything required to start tracing + */ +static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +{ + struct blk_trace *old_bt, *bt = NULL; + int ret; + + ret = -ENOMEM; + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + goto err; + + bt->dev = dev; + bt->act_mask = (u16)-1; + bt->end_lba = -1ULL; + bt->trace_state = Blktrace_running; + + old_bt = xchg(&q->blk_trace, bt); + if (old_bt != NULL) { + (void)xchg(&q->blk_trace, old_bt); + kfree(bt); + ret = -EBUSY; + } + return 0; +err: + return ret; +} + +/* + * sysfs interface to enable and configure tracing + */ + +static ssize_t sysfs_blk_trace_enable_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev != NULL) { + struct request_queue *q = bdev_get_queue(bdev); + + if (q != NULL) { + mutex_lock(&bdev->bd_mutex); + ret = sprintf(buf, "%u\n", !!q->blk_trace); + mutex_unlock(&bdev->bd_mutex); + } + + bdput(bdev); + } + + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + int value; + ssize_t ret = -ENXIO; + + if (count == 0 || sscanf(buf, "%d", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + if (value) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + else + ret = blk_trace_remove_queue(q); + mutex_unlock(&bdev->bd_mutex); + + if (ret == 0) + ret = count; +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf); +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +#define BLK_TRACE_DEVICE_ATTR(_name) \ + DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ + sysfs_blk_trace_attr_show, \ + sysfs_blk_trace_attr_store) + +static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, + sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); +static BLK_TRACE_DEVICE_ATTR(act_mask); +static BLK_TRACE_DEVICE_ATTR(pid); +static BLK_TRACE_DEVICE_ATTR(start_lba); +static BLK_TRACE_DEVICE_ATTR(end_lba); + +static struct attribute *blk_trace_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_act_mask.attr, + &dev_attr_pid.attr, + &dev_attr_start_lba.attr, + &dev_attr_end_lba.attr, + NULL +}; + +struct attribute_group blk_trace_attr_group = { + .name = "trace", + .attrs = blk_trace_attrs, +}; + +static int blk_str2act_mask(const char *str) +{ + int mask = 0; + char *copy = kstrdup(str, GFP_KERNEL), *s; + + if (copy == NULL) + return -ENOMEM; + + s = strstrip(copy); + + while (1) { + char *sep = strchr(s, ','); + + if (sep != NULL) + *sep = '\0'; + + if (strcasecmp(s, "barrier") == 0) + mask |= BLK_TC_BARRIER; + else if (strcasecmp(s, "complete") == 0) + mask |= BLK_TC_COMPLETE; + else if (strcasecmp(s, "fs") == 0) + mask |= BLK_TC_FS; + else if (strcasecmp(s, "issue") == 0) + mask |= BLK_TC_ISSUE; + else if (strcasecmp(s, "pc") == 0) + mask |= BLK_TC_PC; + else if (strcasecmp(s, "queue") == 0) + mask |= BLK_TC_QUEUE; + else if (strcasecmp(s, "read") == 0) + mask |= BLK_TC_READ; + else if (strcasecmp(s, "requeue") == 0) + mask |= BLK_TC_REQUEUE; + else if (strcasecmp(s, "sync") == 0) + mask |= BLK_TC_SYNC; + else if (strcasecmp(s, "write") == 0) + mask |= BLK_TC_WRITE; + + if (sep == NULL) + break; + + s = sep + 1; + } + kfree(copy); + + return mask; +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q; + struct block_device *bdev; + ssize_t ret = -ENXIO; + + lock_kernel(); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + mutex_lock(&bdev->bd_mutex); + if (q->blk_trace == NULL) + ret = sprintf(buf, "disabled\n"); + else if (attr == &dev_attr_act_mask) + ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); + else if (attr == &dev_attr_pid) + ret = sprintf(buf, "%u\n", q->blk_trace->pid); + else if (attr == &dev_attr_start_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + else if (attr == &dev_attr_end_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); + return ret; +} + +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + u64 value; + ssize_t ret = -ENXIO; + + if (count == 0) + goto out; + + if (attr == &dev_attr_act_mask) { + if (sscanf(buf, "%llx", &value) != 1) { + /* Assume it is a list of trace category names */ + value = blk_str2act_mask(buf); + if (value < 0) + goto out; + } + } else if (sscanf(buf, "%llu", &value) != 1) + goto out; + + lock_kernel(); + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out_unlock_kernel; + + q = bdev_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + ret = 0; + if (q->blk_trace == NULL) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + + if (ret == 0) { + if (attr == &dev_attr_act_mask) + q->blk_trace->act_mask = value; + else if (attr == &dev_attr_pid) + q->blk_trace->pid = value; + else if (attr == &dev_attr_start_lba) + q->blk_trace->start_lba = value; + else if (attr == &dev_attr_end_lba) + q->blk_trace->end_lba = value; + ret = count; + } + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out_unlock_kernel: + unlock_kernel(); +out: + return ret; +} -- cgit v1.2.3-59-g8ed1b From 7447dce96f2233d250bc39a4a10a42f7c3dd46fc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Feb 2009 21:33:57 +0100 Subject: tracing/function-graph-tracer: provide a selftest for the function graph tracer Making it more easy to do a basic regression test for this tracer. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_functions_graph.c | 3 +++ kernel/trace/trace_selftest.c | 50 ++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b9838f4a6929..a011ec062225 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -500,6 +500,8 @@ extern int DYN_FTRACE_TEST_NAME(void); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_function_graph(struct tracer *trace, + struct trace_array *tr); extern int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_preemptoff(struct tracer *trace, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 222f97d336a6..88f8d9d80a93 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -750,6 +750,9 @@ static struct tracer graph_trace __read_mostly = { .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function_graph, +#endif }; static __init int init_graph_trace(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 445700e51f6d..0c9aa1457e51 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,6 +13,8 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_PRINT: case TRACE_SPECIAL: case TRACE_BRANCH: + case TRACE_GRAPH_ENT: + case TRACE_GRAPH_RET: return 1; } return 0; @@ -227,6 +229,54 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* + * Pretty much the same than for the function tracer from which the selftest + * has been borrowed. + */ +int +trace_selftest_startup_function_graph(struct tracer *trace, + struct trace_array *tr) +{ + int ret; + unsigned long count; + + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + + tracing_stop(); + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + /* Don't test dynamic tracing, the function tracer already did */ + +out: + /* Stop it if we failed */ + if (ret) + ftrace_graph_stop(); + + return ret; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + + #ifdef CONFIG_IRQSOFF_TRACER int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) -- cgit v1.2.3-59-g8ed1b From 1292211058aaf872eeb2a0e2677d237916b4501f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Feb 2009 22:16:12 +0100 Subject: tracing/power: move the power trace headers to a dedicated file Impact: cleanup Move the power tracer headers to trace/power.h to keep ftrace.h and power bits more easy to maintain as separated topics. Signed-off-by: Frederic Weisbecker Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- arch/x86/kernel/process.c | 2 +- include/linux/ftrace.h | 30 ------------------------- include/trace/power.h | 35 ++++++++++++++++++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_power.c | 2 +- 6 files changed, 39 insertions(+), 33 deletions(-) create mode 100644 include/trace/power.h diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c3..7ed925edf4d2 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e68bb9e30864..026819ffcb0c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 5e302d636fc2..106b7909d500 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -339,36 +339,6 @@ ftrace_init_module(struct module *mod, unsigned long *start, unsigned long *end) { } #endif -enum { - POWER_NONE = 0, - POWER_CSTATE = 1, - POWER_PSTATE = 2, -}; - -struct power_trace { -#ifdef CONFIG_POWER_TRACER - ktime_t stamp; - ktime_t end; - int type; - int state; -#endif -}; - -#ifdef CONFIG_POWER_TRACER -extern void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_end(struct power_trace *it); -#else -static inline void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_end(struct power_trace *it) { } -#endif - - /* * Structure that defines an entry function trace. */ diff --git a/include/trace/power.h b/include/trace/power.h new file mode 100644 index 000000000000..c7cefbcdaea4 --- /dev/null +++ b/include/trace/power.h @@ -0,0 +1,35 @@ +#ifndef _TRACE_POWER_H +#define _TRACE_POWER_H + +#include + +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; + +struct power_trace { +#ifdef CONFIG_POWER_TRACER + ktime_t stamp; + ktime_t end; + int type; + int state; +#endif +}; + +#ifdef CONFIG_POWER_TRACER +extern void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state); +extern void trace_power_end(struct power_trace *it); +#else +static inline void trace_power_start(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_mark(struct power_trace *it, unsigned int type, + unsigned int state) { } +static inline void trace_power_end(struct power_trace *it) { } +#endif + +#endif /* _TRACE_POWER_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a011ec062225..1ecfb9d2b365 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -10,6 +10,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index bfc21f8079ab..b1d0d087d3a6 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include -- cgit v1.2.3-59-g8ed1b From 3861a17bcc0af815f684c6178bc9ec2d790c350e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 8 Feb 2009 00:04:02 +0100 Subject: tracing/function-graph-tracer: drop the kernel_text_address check When the function graph tracer picks a return address, it ensures this address is really a kernel text one by calling __kernel_text_address() Actually this path has never been taken.Its role was more likely to debug the tracer on the beginning of its development but this function is wasteful since it is called for every traced function. The fault check is already sufficient. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 7 ------- kernel/extable.c | 4 ++-- kernel/module.c | 2 +- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d74d75e0952d..18828aee8781 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -491,13 +491,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - if (unlikely(!__kernel_text_address(old))) { - ftrace_graph_stop(); - *parent = old; - WARN_ON(1); - return; - } - calltime = cpu_clock(raw_smp_processor_id()); if (push_return_trace(old, calltime, diff --git a/kernel/extable.c b/kernel/extable.c index e136ed8d82ba..0df6253730be 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -__notrace_funcgraph int core_kernel_text(unsigned long addr) +int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) @@ -54,7 +54,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr) return 0; } -__notrace_funcgraph int __kernel_text_address(unsigned long addr) +int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; diff --git a/kernel/module.c b/kernel/module.c index ba22484a987e..22d7379709da 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2735,7 +2735,7 @@ int is_module_address(unsigned long addr) /* Is this a valid kernel address? */ -__notrace_funcgraph struct module *__module_text_address(unsigned long addr) +struct module *__module_text_address(unsigned long addr) { struct module *mod; -- cgit v1.2.3-59-g8ed1b From b5db03c4355e568f1567758287b30a6a262d5057 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 7 Feb 2009 18:52:59 -0200 Subject: tracing: handle unregistering the current tracer Impact: simplification Instead of requiring that plugins have the sequence: my_tracer_stop(my_trace_array); unregister_tracer(my_tracer); it should be possible just do a: unregister_tracer(my_tracer); Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 03fbd4c20bc2..93040f1bef13 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -559,6 +559,15 @@ void unregister_tracer(struct tracer *type) found: *t = (*t)->next; + + if (type == current_trace && tracer_enabled) { + tracer_enabled = 0; + tracing_stop(); + if (current_trace->stop) + current_trace->stop(&global_trace); + current_trace = &nop_trace; + } + if (strlen(type->name) != max_tracer_type_len) goto out; -- cgit v1.2.3-59-g8ed1b From 17406b82d621930cca8ccc1272cdac9a7dae8e40 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 6 Feb 2009 15:37:47 -0800 Subject: softlockup: remove timestamp checking from hung_task Impact: saves sizeof(long) bytes per task_struct By guaranteeing that sysctl_hung_task_timeout_secs have elapsed between tasklist scans we can avoid using timestamps. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/fork.c | 8 +++----- kernel/hung_task.c | 48 +++++++++--------------------------------------- 3 files changed, 12 insertions(+), 45 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a2811c6239d..e0d723fea9f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,7 +1241,6 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ - unsigned long last_switch_timestamp; unsigned long last_switch_count; #endif /* CPU-specific state of this task */ diff --git a/kernel/fork.c b/kernel/fork.c index fb9444282836..bf582f75014b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -639,6 +639,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; +#ifdef CONFIG_DETECT_HUNG_TASK + tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; +#endif tsk->mm = NULL; tsk->active_mm = NULL; @@ -1041,11 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_HUNG_TASK - p->last_switch_count = 0; - p->last_switch_timestamp = 0; -#endif - task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 3951a80e7cbe..0c924de58cb2 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -34,7 +34,6 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; -static unsigned long __read_mostly hung_task_poll_jiffies; unsigned long __read_mostly sysctl_hung_task_warnings = 10; @@ -69,33 +68,17 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. - */ -static unsigned long get_timestamp(void) -{ - int this_cpu = raw_smp_processor_id(); - - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void check_hung_task(struct task_struct *t, unsigned long now, - unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; if (t->flags & PF_FROZEN) return; - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; - t->last_switch_timestamp = now; return; } - if ((long)(now - t->last_switch_timestamp) < timeout) - return; if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; @@ -111,7 +94,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now, sched_show_task(t); __debug_show_held_locks(t); - t->last_switch_timestamp = now; touch_nmi_watchdog(); if (sysctl_hung_task_panic) @@ -145,7 +127,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; int batch_count = HUNG_TASK_BATCHING; - unsigned long now = get_timestamp(); struct task_struct *g, *t; /* @@ -168,19 +149,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now, timeout); + check_hung_task(t, timeout); } while_each_thread(g, t); unlock: rcu_read_unlock(); } -static void update_poll_jiffies(void) +static unsigned long timeout_jiffies(unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - if (sysctl_hung_task_timeout_secs == 0) - hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; - else - hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; + return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; } /* @@ -197,8 +175,6 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, if (ret || !write) goto out; - update_poll_jiffies(); - wake_up_process(watchdog_task); out: @@ -211,20 +187,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, static int watchdog(void *dummy) { set_user_nice(current, 0); - update_poll_jiffies(); for ( ; ; ) { - unsigned long timeout; + unsigned long timeout = sysctl_hung_task_timeout_secs; - while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + while (schedule_timeout_interruptible(timeout_jiffies(timeout))) + timeout = sysctl_hung_task_timeout_secs; - /* - * Need to cache timeout here to avoid timeout being set - * to 0 via sysctl while inside check_hung_*_tasks(). - */ - timeout = sysctl_hung_task_timeout_secs; - if (timeout) - check_hung_uninterruptible_tasks(timeout); + check_hung_uninterruptible_tasks(timeout); } return 0; -- cgit v1.2.3-59-g8ed1b From 1dfba05d0f1a9b4245bb242a7c17fe448811a520 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Feb 2009 12:06:54 +0100 Subject: tracing/blktrace: move the tracing file to kernel/trace, fix Impact: build fix The BLK_DEV_IO_TRACE entry used to be in block/Kconfig - which file itself was dependent on CONFIG_BLOCK. But now the entry is in kernel/trace/Kconfig - which is present even on !CONFIG_BLOCK. So add a 'depends on BLOCK' to BLK_DEV_IO_TRACE. Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 4fee43c01942..3a331289457a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -305,6 +305,7 @@ config WORKQUEUE_TRACER config BLK_DEV_IO_TRACE bool "Support for tracing block io actions" depends on SYSFS + depends on BLOCK select RELAY select DEBUG_FS select TRACEPOINTS -- cgit v1.2.3-59-g8ed1b From b91facc367366b3f71375f337eb5997ec9ab4e69 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Feb 2009 18:30:44 +0100 Subject: tracing/function-graph-tracer: handle the leaf functions from trace_pipe When one cats the trace file, the leaf functions are printed without brackets: function(); whereas in the trace_pipe file we'll see the following: function() { } This is because the ring_buffer handling is not the same between those two files. On the trace file, when an entry is printed, the iterator advanced and then we can check the next entry. There is no iterator with trace_pipe, the current entry to print has been peeked and not consumed. So checking the next entry will still return the current one while we don't consume it. This patch introduces a new value for the output callbacks to ask the tracing core to not consume the current entry after printing it. We need it because we will have to consume the current entry ourself to check the next one. Now the trace_pipe is able to handle well the leaf functions. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 +-- kernel/trace/trace.h | 7 +++--- kernel/trace/trace_functions_graph.c | 48 ++++++++++++++++++++++-------------- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 93040f1bef13..5b1e9a9e9906 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2468,8 +2468,8 @@ waitagain: iter->seq.len = len; break; } - - trace_consume(iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); if (iter->seq.len >= cnt) break; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1ecfb9d2b365..7b0518adf6d7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -63,13 +63,13 @@ struct ftrace_entry { /* Function call entry */ struct ftrace_graph_ent_entry { - struct trace_entry ent; + struct trace_entry ent; struct ftrace_graph_ent graph_ent; }; /* Function return entry */ struct ftrace_graph_ret_entry { - struct trace_entry ent; + struct trace_entry ent; struct ftrace_graph_ret ret; }; extern struct tracer boot_tracer; @@ -309,7 +309,8 @@ extern void __ftrace_bad_type(void); enum print_line_t { TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ TRACE_TYPE_HANDLED = 1, - TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ + TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ + TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 88f8d9d80a93..782ec0fdf453 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -212,8 +212,8 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) return ret; } -static bool -trace_branch_is_leaf(struct trace_iterator *iter, +static struct ftrace_graph_ret_entry * +get_return_for_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *curr) { struct ring_buffer_iter *ring_iter; @@ -222,24 +222,33 @@ trace_branch_is_leaf(struct trace_iterator *iter, ring_iter = iter->buffer_iter[iter->cpu]; - if (!ring_iter) - return false; - - event = ring_buffer_iter_peek(ring_iter, NULL); + /* First peek to compare current entry and the next one */ + if (ring_iter) + event = ring_buffer_iter_peek(ring_iter, NULL); + else { + /* We need to consume the current entry to see the next one */ + ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + NULL); + } if (!event) - return false; + return NULL; next = ring_buffer_event_data(event); if (next->ent.type != TRACE_GRAPH_RET) - return false; + return NULL; if (curr->ent.pid != next->ent.pid || curr->graph_ent.func != next->ret.func) - return false; + return NULL; - return true; + /* this is a leaf, now advance the iterator */ + if (ring_iter) + ring_buffer_read(ring_iter, NULL); + + return next; } /* Signal a overhead of time execution to the output */ @@ -376,18 +385,15 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s) /* Case of a leaf function on its call entry */ static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, - struct ftrace_graph_ent_entry *entry, struct trace_seq *s) + struct ftrace_graph_ent_entry *entry, + struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) { - struct ftrace_graph_ret_entry *ret_entry; struct ftrace_graph_ret *graph_ret; - struct ring_buffer_event *event; struct ftrace_graph_ent *call; unsigned long long duration; int ret; int i; - event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - ret_entry = ring_buffer_event_data(event); graph_ret = &ret_entry->ret; call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; @@ -457,7 +463,11 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + /* + * we already consumed the current entry to check the next one + * and see if this is a leaf. + */ + return TRACE_TYPE_NO_CONSUME; } static enum print_line_t @@ -469,6 +479,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, pid_t *last_entry = iter->private; struct trace_entry *ent = iter->ent; struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ret_entry *leaf_ret; /* Pid */ if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE) @@ -504,8 +515,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - if (trace_branch_is_leaf(iter, field)) - return print_graph_entry_leaf(iter, field, s); + leaf_ret = get_return_for_leaf(iter, field); + if (leaf_ret) + return print_graph_entry_leaf(iter, field, leaf_ret, s); else return print_graph_entry_nested(field, s, iter->ent->pid, cpu); -- cgit v1.2.3-59-g8ed1b From 2bc275e9b04f23708c495a8bfe92a5f4b65345c8 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 5 Feb 2009 13:08:11 +0200 Subject: UBIFS: fix dbg_chk_lpt_sz() The debugging function dbg_chk_lpt_sz() was not working correctly for small min_io_unit size e.g. NOR flash. Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy --- fs/ubifs/lpt_commit.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 3216a1f277f8..27c97a1873d5 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c) while (offs + len > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->lsave_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->ltab_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -416,9 +416,8 @@ static int write_cnodes(struct ubifs_info *c) alen, UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 4, alen - wlen); } - dbg_chk_lpt_sz(c, 2, 0); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -477,7 +476,7 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 2, alen - wlen); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -504,7 +503,7 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 2, alen - wlen); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -1756,10 +1755,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) /** * dbg_chk_lpt_sz - check LPT does not write more than LPT size. * @c: the UBIFS file-system description object - * @action: action + * @action: what to do * @len: length written * * This function returns %0 on success and a negative error code on failure. + * The @action argument may be one of: + * o %0 - LPT debugging checking starts, initialize debugging variables; + * o %1 - wrote an LPT node, increase LPT size by @len bytes; + * o %2 - switched to a different LEB and wasted @len bytes; + * o %3 - check that we've written the right number of bytes. + * o %4 - wasted @len bytes; */ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) { -- cgit v1.2.3-59-g8ed1b From 3c56819b14b00dd449bd776303e61f8532fad09f Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 9 Feb 2009 08:15:56 +0200 Subject: tracing: splice support for tracing_pipe Added and implemented tracing_pipe_fops->splice_read(). This allows userspace programs to get tracing data more efficiently. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 6 +++ 2 files changed, 142 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5b1e9a9e9906..9e29fdb0dfe5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -364,6 +365,25 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) return cnt; } +ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +{ + int len; + void *ret; + + if (s->len <= s->readpos) + return -EBUSY; + + len = s->len - s->readpos; + if (cnt > len) + cnt = len; + ret = memcpy(buf, s->buffer + s->readpos, cnt); + if (!ret) + return -EFAULT; + + s->readpos += len; + return cnt; +} + static void trace_print_seq(struct seq_file *m, struct trace_seq *s) { @@ -2493,6 +2513,121 @@ out: return sret; } +static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + __free_page(buf->page); +} + +static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, + unsigned int idx) +{ + __free_page(spd->pages[idx]); +} + +static struct pipe_buf_operations tracing_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = tracing_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t tracing_splice_read_pipe(struct file *filp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct trace_iterator *iter = filp->private_data; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .nr_pages = 0, /* This gets updated below. */ + .flags = flags, + .ops = &tracing_pipe_buf_ops, + .spd_release = tracing_spd_release_pipe, + }; + ssize_t ret; + size_t count, rem; + unsigned int i; + + mutex_lock(&trace_types_lock); + + if (iter->trace->splice_read) { + ret = iter->trace->splice_read(iter, filp, + ppos, pipe, len, flags); + if (ret) + goto out; + } + + ret = tracing_wait_pipe(filp); + if (ret <= 0) + goto out; + + if (!iter->ent && !find_next_entry_inc(iter)) { + ret = -EFAULT; + goto out; + } + + /* Fill as many pages as possible. */ + for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { + pages[i] = alloc_page(GFP_KERNEL); + + /* Seq buffer is page-sized, exactly what we need. */ + for (;;) { + count = iter->seq.len; + ret = print_trace_line(iter); + count = iter->seq.len - count; + if (rem < count) { + rem = 0; + iter->seq.len -= count; + break; + } + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.len -= count; + break; + } + + trace_consume(iter); + rem -= count; + if (!find_next_entry_inc(iter)) { + rem = 0; + iter->ent = NULL; + break; + } + } + + /* Copy the data into the page, so we can start over. */ + ret = trace_seq_to_buffer(&iter->seq, + page_address(pages[i]), + iter->seq.len); + if (ret < 0) { + __free_page(pages[i]); + break; + } + partial[i].offset = 0; + partial[i].len = iter->seq.len; + + trace_seq_reset(&iter->seq); + } + + mutex_unlock(&trace_types_lock); + + spd.nr_pages = i; + + return splice_to_pipe(pipe, &spd); + +out: + mutex_unlock(&trace_types_lock); + + return ret; +} + static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -2656,6 +2791,7 @@ static struct file_operations tracing_pipe_fops = { .open = tracing_open_pipe, .poll = tracing_poll_pipe, .read = tracing_read_pipe, + .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7b0518adf6d7..dbff0207b213 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -353,6 +353,12 @@ struct tracer { ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); + ssize_t (*splice_read)(struct trace_iterator *iter, + struct file *filp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags); #ifdef CONFIG_FTRACE_STARTUP_TEST int (*selftest)(struct tracer *trace, struct trace_array *tr); -- cgit v1.2.3-59-g8ed1b From ff98781bab2735e6c89793034173e0cb5007a7e5 Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Mon, 9 Feb 2009 08:15:55 +0200 Subject: tracing: Move pipe waiting code out of tracing_read_pipe(). This moves the pipe waiting code from tracing_read_pipe() into tracing_wait_pipe(), which is useful to implement other fops, like splice_read. Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 69 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9e29fdb0dfe5..11fde0ad9cb6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2388,37 +2388,15 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) } } -/* - * Consumer reader. - */ -static ssize_t -tracing_read_pipe(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +/* Must be called with trace_types_lock mutex held. */ +static int tracing_wait_pipe(struct file *filp) { struct trace_iterator *iter = filp->private_data; - ssize_t sret; - - /* return any leftover data */ - sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (sret != -EBUSY) - return sret; - - trace_seq_reset(&iter->seq); - mutex_lock(&trace_types_lock); - if (iter->trace->read) { - sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); - if (sret) - goto out; - } - -waitagain: - sret = 0; while (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) { - sret = -EAGAIN; - goto out; + return -EAGAIN; } /* @@ -2443,12 +2421,11 @@ waitagain: iter->tr->waiter = NULL; if (signal_pending(current)) { - sret = -EINTR; - goto out; + return -EINTR; } if (iter->trace != current_trace) - goto out; + return 0; /* * We block until we read something and tracing is disabled. @@ -2465,9 +2442,43 @@ waitagain: continue; } + return 1; +} + +/* + * Consumer reader. + */ +static ssize_t +tracing_read_pipe(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_iterator *iter = filp->private_data; + ssize_t sret; + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + return sret; + + trace_seq_reset(&iter->seq); + + mutex_lock(&trace_types_lock); + if (iter->trace->read) { + sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (sret) + goto out; + } + +waitagain: + sret = tracing_wait_pipe(filp); + if (sret <= 0) + goto out; + /* stop when tracing is finished */ - if (trace_empty(iter)) + if (trace_empty(iter)) { + sret = 0; goto out; + } if (cnt >= PAGE_SIZE) cnt = PAGE_SIZE - 1; -- cgit v1.2.3-59-g8ed1b From 34cd4998d38f9bd04f34b78a7cb0c7f1bee00bd9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Feb 2009 12:06:29 -0500 Subject: tracing: clean up splice code Ingo Molnar suggested a series of clean ups for the splice code. This patch implements those suggestions. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 96 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 41 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 11fde0ad9cb6..d89821283b47 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2537,15 +2537,49 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, } static struct pipe_buf_operations tracing_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = tracing_pipe_buf_release, - .steal = generic_pipe_buf_steal, - .get = generic_pipe_buf_get, + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = tracing_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, }; +static size_t +tracing_fill_pipe_page(struct page *pages, size_t rem, + struct trace_iterator *iter) +{ + size_t count; + int ret; + + /* Seq buffer is page-sized, exactly what we need. */ + for (;;) { + count = iter->seq.len; + ret = print_trace_line(iter); + count = iter->seq.len - count; + if (rem < count) { + rem = 0; + iter->seq.len -= count; + break; + } + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.len -= count; + break; + } + + trace_consume(iter); + rem -= count; + if (!find_next_entry_inc(iter)) { + rem = 0; + iter->ent = NULL; + break; + } + } + + return rem; +} + static ssize_t tracing_splice_read_pipe(struct file *filp, loff_t *ppos, struct pipe_inode_info *pipe, @@ -2556,15 +2590,15 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, struct partial_page partial[PIPE_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages = 0, /* This gets updated below. */ - .flags = flags, - .ops = &tracing_pipe_buf_ops, - .spd_release = tracing_spd_release_pipe, + .pages = pages, + .partial = partial, + .nr_pages = 0, /* This gets updated below. */ + .flags = flags, + .ops = &tracing_pipe_buf_ops, + .spd_release = tracing_spd_release_pipe, }; ssize_t ret; - size_t count, rem; + size_t rem; unsigned int i; mutex_lock(&trace_types_lock); @@ -2573,45 +2607,25 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = iter->trace->splice_read(iter, filp, ppos, pipe, len, flags); if (ret) - goto out; + goto out_err; } ret = tracing_wait_pipe(filp); if (ret <= 0) - goto out; + goto out_err; if (!iter->ent && !find_next_entry_inc(iter)) { ret = -EFAULT; - goto out; + goto out_err; } /* Fill as many pages as possible. */ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + break; - /* Seq buffer is page-sized, exactly what we need. */ - for (;;) { - count = iter->seq.len; - ret = print_trace_line(iter); - count = iter->seq.len - count; - if (rem < count) { - rem = 0; - iter->seq.len -= count; - break; - } - if (ret == TRACE_TYPE_PARTIAL_LINE) { - iter->seq.len -= count; - break; - } - - trace_consume(iter); - rem -= count; - if (!find_next_entry_inc(iter)) { - rem = 0; - iter->ent = NULL; - break; - } - } + rem = tracing_fill_pipe_page(pages[i], rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, @@ -2633,7 +2647,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, return splice_to_pipe(pipe, &spd); -out: +out_err: mutex_unlock(&trace_types_lock); return ret; -- cgit v1.2.3-59-g8ed1b From b85fa01ed958ca59523a2db3c2ee647b98745d6a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 9 Feb 2009 14:21:14 +0800 Subject: ring_buffer: fix typing mistake Impact: Fix bug I found several very very curious line. It's so curious that it may be brought by typing mistake. When (cpu_buffer->reader_page == cpu_buffer->commit_page): 1) We haven't copied it for bpage is changed: bpage = cpu_buffer->reader_page->page; memcpy(bpage->data, cpu_buffer->reader_page->page->data + read ... ) 2) We need update cpu_buffer->reader_page->read, but "cpu_buffer->reader_page += read;" is not right. [ This bug was a typo. The commit->reader_page is a page pointer and not an index into the page. The line should have been commit->reader_page->read += read. The other changes by Lai are nice clean ups to the code. - SDR ] Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 53ba3a6d16d0..eca282720838 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2406,7 +2406,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * to swap with a page in the ring buffer. * * for example: - * rpage = ring_buffer_alloc_page(buffer); + * rpage = ring_buffer_alloc_read_page(buffer); * if (!rpage) * return error; * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); @@ -2461,18 +2461,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer, */ if (cpu_buffer->reader_page == cpu_buffer->commit_page) { unsigned int read = cpu_buffer->reader_page->read; + unsigned int commit = rb_page_commit(cpu_buffer->reader_page); if (full) goto out; /* The writer is still on the reader page, we must copy */ - bpage = cpu_buffer->reader_page->page; memcpy(bpage->data, cpu_buffer->reader_page->page->data + read, - local_read(&bpage->commit) - read); + commit - read); /* consume what was read */ - cpu_buffer->reader_page += read; - + cpu_buffer->reader_page->read = commit; } else { /* swap the pages */ rb_init_page(bpage); -- cgit v1.2.3-59-g8ed1b From 667d24125839b6f3363d8177d7ed9fab8a40e45f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 9 Feb 2009 14:21:17 +0800 Subject: ring_buffer: fix ring_buffer_read_page() Impact: change API and init bpage when copy ring_buffer_read_page()/rb_remove_entries() may be called for a partially consumed page. Add a parameter for rb_remove_entries() and make it update cpu_buffer->entries correctly for partially consumed pages. ring_buffer_read_page() now returns the offset to the next event. Init the bpage's time_stamp when return value is 0. Signed-off-by: Lai Jiangshan Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index eca282720838..10d202ea06f3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2332,13 +2332,14 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *bpage) + struct buffer_data_page *bpage, + unsigned int offset) { struct ring_buffer_event *event; unsigned long head; __raw_spin_lock(&cpu_buffer->lock); - for (head = 0; head < local_read(&bpage->commit); + for (head = offset; head < local_read(&bpage->commit); head += rb_event_length(event)) { event = __rb_data_page_index(bpage, head); @@ -2410,8 +2411,8 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * if (!rpage) * return error; * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); - * if (ret) - * process_page(rpage); + * if (ret >= 0) + * process_page(rpage, ret); * * When @full is set, the function will not return true unless * the writer is off the reader page. @@ -2422,8 +2423,8 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * responsible for that. * * Returns: - * 1 if data has been transferred - * 0 if no data has been transferred. + * >=0 if data has been transferred, returns the offset of consumed data. + * <0 if no data has been transferred. */ int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, int cpu, int full) @@ -2432,7 +2433,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, struct ring_buffer_event *event; struct buffer_data_page *bpage; unsigned long flags; - int ret = 0; + unsigned int read; + int ret = -1; if (!data_page) return 0; @@ -2454,24 +2456,29 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* check for data */ if (!local_read(&cpu_buffer->reader_page->page->commit)) goto out; + + read = cpu_buffer->reader_page->read; /* * If the writer is already off of the read page, then simply * switch the read page with the given page. Otherwise * we need to copy the data from the reader to the writer. */ if (cpu_buffer->reader_page == cpu_buffer->commit_page) { - unsigned int read = cpu_buffer->reader_page->read; unsigned int commit = rb_page_commit(cpu_buffer->reader_page); + struct buffer_data_page *rpage = cpu_buffer->reader_page->page; if (full) goto out; /* The writer is still on the reader page, we must copy */ - memcpy(bpage->data, - cpu_buffer->reader_page->page->data + read, - commit - read); + memcpy(bpage->data + read, rpage->data + read, commit - read); /* consume what was read */ cpu_buffer->reader_page->read = commit; + + /* update bpage */ + local_set(&bpage->commit, commit); + if (!read) + bpage->time_stamp = rpage->time_stamp; } else { /* swap the pages */ rb_init_page(bpage); @@ -2480,10 +2487,10 @@ int ring_buffer_read_page(struct ring_buffer *buffer, cpu_buffer->reader_page->read = 0; *data_page = bpage; } - ret = 1; + ret = read; /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage); + rb_remove_entries(cpu_buffer, bpage, read); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3-59-g8ed1b From 966657883fdc3a2883a5e641ca4ec8f79ffb8ecd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Feb 2009 11:53:23 -0500 Subject: tracing, x86: fix constraint for parent variable The constraint used for retrieving and restoring the parent function pointer is incorrect. The parent variable is a pointer, and the address of the pointer is modified by the asm statement and not the pointer itself. It is incorrect to pass it in as an output constraint since the asm will never update the pointer. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 18828aee8781..370bafaa43a3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -468,8 +468,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) * ignore such a protection. */ asm volatile( - "1: " _ASM_MOV " (%[parent_old]), %[old]\n" - "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n" + "1: " _ASM_MOV " (%[parent]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent])\n" " movl $0, %[faulted]\n" ".section .fixup, \"ax\"\n" @@ -479,9 +479,8 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) _ASM_EXTABLE(1b, 3b) _ASM_EXTABLE(2b, 3b) - : [parent_replaced] "=r" (parent), [old] "=r" (old), - [faulted] "=r" (faulted) - : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : [old] "=r" (old), [faulted] "=r" (faulted) + : [parent] "r" (parent), [return_hooker] "r" (return_hooker) : "memory" ); -- cgit v1.2.3-59-g8ed1b From 4543ae7ce1cb8b5ff27a59009e7991ea63791a71 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 9 Feb 2009 23:09:32 +0100 Subject: tracing: storage class should be before const qualifier The C99 specification states in section 6.11.5: The placement of a storage-class specifier other than at the beginning of the declaration specifiers in a declaration is an obsolescent feature. Signed-off-by: Tobias Klauser Signed-off-by: Steven Rostedt --- kernel/trace/trace_sysprof.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 84ca9d81e74d..4e2de4de1d65 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -88,7 +88,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } } -const static struct stacktrace_ops backtrace_ops = { +static const struct stacktrace_ops backtrace_ops = { .warning = backtrace_warning, .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, -- cgit v1.2.3-59-g8ed1b From f54fc98aa656f334c1571df6e3ca9178ea223847 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 10 Feb 2009 01:02:46 -0500 Subject: tracing: remove unneeded variable Impact: clean up. Remove the unnecessary variable ret. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index f8ae2c50e01d..c2e68d440c4d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -91,8 +91,6 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) int enable_branch_tracing(struct trace_array *tr) { - int ret = 0; - mutex_lock(&branch_tracing_mutex); branch_tracer = tr; /* @@ -103,7 +101,7 @@ int enable_branch_tracing(struct trace_array *tr) branch_tracing_enabled++; mutex_unlock(&branch_tracing_mutex); - return ret; + return 0; } void disable_branch_tracing(void) -- cgit v1.2.3-59-g8ed1b From 810dc73265cd690b2bc6010489b4317bba2cda39 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 10 Feb 2009 01:03:05 -0500 Subject: tracing: provide correct return value after outputting the event This patch is to make the function return early on failure, and give correct return value on success. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 782ec0fdf453..519a0cab1530 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -186,30 +186,30 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) ret = trace_seq_printf(s, " ------------------------------------------\n"); if (!ret) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; ret = print_graph_cpu(s, cpu); if (ret == TRACE_TYPE_PARTIAL_LINE) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; ret = print_graph_proc(s, prev_pid); if (ret == TRACE_TYPE_PARTIAL_LINE) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; ret = trace_seq_printf(s, " => "); if (!ret) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; ret = trace_seq_printf(s, "\n ------------------------------------------\n\n"); if (!ret) - TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_PARTIAL_LINE; - return ret; + return TRACE_TYPE_HANDLED; } static struct ftrace_graph_ret_entry * -- cgit v1.2.3-59-g8ed1b From c3706f005c3aaf570e71f0f083fdbb59a5a9fa2e Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 10 Feb 2009 01:03:18 -0500 Subject: tracing: fix typos in comments Impact: clean up. Fix typos in the comments. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 8 ++++---- kernel/trace/trace.c | 2 +- kernel/trace/trace_hw_branches.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 3c103d636da3..8e6646a54acf 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -8,7 +8,7 @@ struct ring_buffer; struct ring_buffer_iter; /* - * Don't reference this struct directly, use functions below. + * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { u32 type:2, len:3, time_delta:27; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 10d202ea06f3..fa64e1f003eb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -91,7 +91,7 @@ EXPORT_SYMBOL_GPL(tracing_off); * tracing_off_permanent - permanently disable ring buffers * * This function, once called, will disable all ring buffers - * permanenty. + * permanently. */ void tracing_off_permanent(void) { @@ -210,7 +210,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); struct buffer_data_page { u64 time_stamp; /* page time stamp */ - local_t commit; /* write commited index */ + local_t commit; /* write committed index */ unsigned char data[]; /* data of buffer page */ }; @@ -260,7 +260,7 @@ struct ring_buffer_per_cpu { struct list_head pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ - struct buffer_page *commit_page; /* commited pages */ + struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; unsigned long overrun; unsigned long entries; @@ -303,7 +303,7 @@ struct ring_buffer_iter { * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test * - * As a safty measure we check to make sure the data pages have not + * As a safety measure we check to make sure the data pages have not * been corrupted. */ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d89821283b47..d7c175a442df 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1963,7 +1963,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, struct tracer_opt *trace_opts = current_trace->flags->opts; - /* calulate max size */ + /* calculate max size */ for (i = 0; trace_options[i]; i++) { len += strlen(trace_options[i]); len += 3; /* "no" and space */ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e3e7db61c067..0794dd33f27b 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -75,7 +75,7 @@ static void bts_trace_start(struct trace_array *tr) } /* - * Start tracing on the current cpu. + * Stop tracing on the current cpu. * The argument is ignored. * * pre: bts_tracer_mutex must be locked. -- cgit v1.2.3-59-g8ed1b From 4fd2735881bf4d8bf5e30979f31fc2f1b1d505fa Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Tue, 10 Feb 2009 19:44:12 +0100 Subject: tracing: fix sparse warnings: make symbols static Impact: make global variables and a global function static The function '__trace_userstack' does not seem to have a caller, so it is commented out. Fix this sparse warnings: kernel/trace/trace.c:82:5: warning: symbol 'tracing_disabled' was not declared. Should it be static? kernel/trace/trace.c:600:10: warning: symbol 'trace_record_cmdline_disabled' was not declared. Should it be static? kernel/trace/trace.c:957:6: warning: symbol '__trace_userstack' was not declared. Should it be static? kernel/trace/trace.c:1694:5: warning: symbol 'tracing_release' was not declared. Should it be static? Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7c175a442df..c157ba70f30c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -80,7 +80,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) * of the tracer is successful. But that is the only place that sets * this back to zero. */ -int tracing_disabled = 1; +static int tracing_disabled = 1; static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); @@ -626,7 +626,7 @@ static int cmdline_idx; static DEFINE_SPINLOCK(trace_cmdline_lock); /* temporary disable recording */ -atomic_t trace_record_cmdline_disabled __read_mostly; +static atomic_t trace_record_cmdline_disabled __read_mostly; static void trace_init_cmdlines(void) { @@ -983,10 +983,12 @@ static void ftrace_trace_userstack(struct trace_array *tr, #endif } -void __trace_userstack(struct trace_array *tr, unsigned long flags) +#ifdef UNUSED +static void __trace_userstack(struct trace_array *tr, unsigned long flags) { ftrace_trace_userstack(tr, flags, preempt_count()); } +#endif /* UNUSED */ static void ftrace_trace_special(void *__tr, @@ -1720,7 +1722,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } -int tracing_release(struct inode *inode, struct file *file) +static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct trace_iterator *iter = m->private; -- cgit v1.2.3-59-g8ed1b From 5e39841c45cf5e6ea930ede1b0303309e03037a2 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Tue, 10 Feb 2009 19:44:34 +0100 Subject: tracing: fix sparse warnings: fix (un-)signedness Fix these sparse warnings: kernel/trace/ring_buffer.c:70:37: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:84:39: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:96:43: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2475:13: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2475:13: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2478:42: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2478:42: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2500:40: warning: incorrect type in argument 3 (different signedness) kernel/trace/ring_buffer.c:2505:44: warning: incorrect type in argument 2 (different signedness) kernel/trace/ring_buffer.c:2507:46: warning: incorrect type in argument 2 (different signedness) kernel/trace/trace.c:2130:40: warning: incorrect type in argument 3 (different signedness) kernel/trace/trace.c:2280:40: warning: incorrect type in argument 3 (different signedness) Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 ++++---- kernel/trace/trace.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fa64e1f003eb..dc18b5b9ccb4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -59,7 +59,7 @@ enum { RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, }; -static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; +static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; /** * tracing_on - enable all tracing buffers @@ -2501,7 +2501,7 @@ static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - long *p = filp->private_data; + unsigned long *p = filp->private_data; char buf[64]; int r; @@ -2517,9 +2517,9 @@ static ssize_t rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - long *p = filp->private_data; + unsigned long *p = filp->private_data; char buf[64]; - long val; + unsigned long val; int ret; if (cnt >= sizeof(buf)) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c157ba70f30c..e2434990277c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2147,7 +2147,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; char buf[64]; - long val; + unsigned long val; int ret; if (cnt >= sizeof(buf)) @@ -2295,9 +2295,9 @@ static ssize_t tracing_max_lat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - long *ptr = filp->private_data; + unsigned long *ptr = filp->private_data; char buf[64]; - long val; + unsigned long val; int ret; if (cnt >= sizeof(buf)) -- cgit v1.2.3-59-g8ed1b From e7669b8e329255bbcb40af65b38e342825d97a46 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Tue, 10 Feb 2009 19:44:45 +0100 Subject: tracing: fix sparse warning: attribute function with __acquires/__releases Fix this sparse warning: kernel/trace/trace.c:458:9: warning: context imbalance in 'register_tracer' - unexpected unlock Signed-off-by: Hannes Eder Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e2434990277c..95f99a7abf2f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -459,6 +459,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * Register a new plugin tracer. */ int register_tracer(struct tracer *type) +__releases(kernel_lock) +__acquires(kernel_lock) { struct tracer *t; int len; -- cgit v1.2.3-59-g8ed1b From cf2592f59c0e8ed4308adbdb2e0a88655379d579 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Feb 2009 16:52:37 +0100 Subject: softlockup: ensure the task has been switched out once When we check if a task has been switched out since the last scan, we might have a race condition on the following scenario: - the task is freshly created and scheduled - it puts its state to TASK_UNINTERRUPTIBLE and is not yet switched out - check_hung_task() scans this task and will report a false positive because t->nvcsw + t->nivcsw == t->last_switch_count == 0 Add a check for such cases. Signed-off-by: Frederic Weisbecker Acked-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c924de58cb2..022a4927b785 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -72,7 +72,13 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; - if (t->flags & PF_FROZEN) + /* + * Ensure the task is not frozen. + * Also, when a freshly created task is scheduled once, changes + * its state to TASK_UNINTERRUPTIBLE without having ever been + * switched out once, it musn't be checked. + */ + if (unlikely(t->flags & PF_FROZEN || !switch_count)) return; if (switch_count != t->last_switch_count) { -- cgit v1.2.3-59-g8ed1b From b22f4858126a6aa852ad745b94f6b25dbdea708e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Feb 2009 15:49:11 +0100 Subject: tracing/sysprof: add missing tracing_{start,stop}_record_cmdline() Add the missing pair tracing_{start,stop}_record_cmdline() to record well the cmdline associated with pid. Changes in v2: - fix a build error, the sched_switch tracer is needed to record the cmdline. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + kernel/trace/trace_sysprof.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3a331289457a..6ff928acd453 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -134,6 +134,7 @@ config SYSPROF_TRACER bool "Sysprof Tracer" depends on X86 select TRACING + select CONTEXT_SWITCH_TRACER help This tracer provides the trace needed by the 'Sysprof' userspace tool. diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 84ca9d81e74d..9902c15997ad 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -238,6 +238,8 @@ static int stack_trace_init(struct trace_array *tr) { sysprof_trace = tr; + tracing_start_cmdline_record(); + mutex_lock(&sample_timer_lock); start_stack_timers(); tracer_enabled = 1; @@ -247,6 +249,7 @@ static int stack_trace_init(struct trace_array *tr) static void stack_trace_reset(struct trace_array *tr) { + tracing_stop_cmdline_record(); stop_stack_trace(tr); } -- cgit v1.2.3-59-g8ed1b From 00f62f614bb713027b9296068d1879fbca511eb7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Feb 2009 17:04:06 -0200 Subject: ring_buffer: pahole struct ring_buffer While fixing some bugs in pahole (built-in.o files were not being processed due to relocation problems) I found out about these packable structures: $ pahole --packable kernel/trace/ring_buffer.o | grep ring ring_buffer 72 64 8 ring_buffer_per_cpu 112 104 8 If we take a look at the current layout of struct ring_buffer we can see that we have two 4 bytes holes. $ pahole -C ring_buffer kernel/trace/ring_buffer.o struct ring_buffer { unsigned int pages; /* 0 4 */ unsigned int flags; /* 4 4 */ int cpus; /* 8 4 */ /* XXX 4 bytes hole, try to pack */ cpumask_var_t cpumask; /* 16 8 */ atomic_t record_disabled; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ struct mutex mutex; /* 32 32 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct ring_buffer_per_cpu * * buffers; /* 64 8 */ /* size: 72, cachelines: 2, members: 7 */ /* sum members: 64, holes: 2, sum holes: 8 */ /* last cacheline: 8 bytes */ }; So, if I ask pahole to reorganize it: $ pahole -C ring_buffer --reorganize kernel/trace/ring_buffer.o struct ring_buffer { unsigned int pages; /* 0 4 */ unsigned int flags; /* 4 4 */ int cpus; /* 8 4 */ atomic_t record_disabled; /* 12 4 */ cpumask_var_t cpumask; /* 16 8 */ struct mutex mutex; /* 24 32 */ struct ring_buffer_per_cpu * * buffers; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ /* size: 64, cachelines: 1, members: 7 */ }; /* saved 8 bytes and 1 cacheline! */ We get it using just one 64 bytes cacheline. To see what it did: $ pahole -C ring_buffer --reorganize --show_reorg_steps \ kernel/trace/ring_buffer.o | grep \/ /* Moving 'record_disabled' from after 'cpumask' to after 'cpus' */ Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 53ba3a6d16d0..27ef3bf13ed2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -273,8 +273,8 @@ struct ring_buffer { unsigned pages; unsigned flags; int cpus; - cpumask_var_t cpumask; atomic_t record_disabled; + cpumask_var_t cpumask; struct mutex mutex; -- cgit v1.2.3-59-g8ed1b From 9f8d979f082a3ee1b27f32b7e0811b51c3ad1d15 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 12 Feb 2009 13:10:17 +0100 Subject: softlockup: move 'one' to the softlockup section in sysctl.c CONFIG_SOFTLOCKUP=y || CONFIG_DETECT_HUNG_TASKS=y is now the only user of the 'one' constant in kernel/sysctl.c. Move it to the softlockup block of constants. This fixes a GCC warning. Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3b6b54c8ac0d..6d2aeff92b3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -90,6 +90,9 @@ extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ /* Constants used for minimum and maximum */ +#if defined(CONFIG_DETECT_HUNG_TASK) || defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) +static int one = 1; +#endif #ifdef CONFIG_DETECT_SOFTLOCKUP static int sixty = 60; static int neg_one = -1; @@ -100,7 +103,6 @@ static int two = 2; #endif static int zero; -static int one = 1; static unsigned long one_ul = 1; static int one_hundred = 100; -- cgit v1.2.3-59-g8ed1b From 5a5fb7dbe88dd57dc2bef0f3be9da991e789612d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Feb 2009 10:53:37 -0500 Subject: preempt-count: force hardirq-count to max of 10 To add a bit in the preempt_count to be set when in NMI context, we found that some archs did not have enough bits to spare. This is due to the hardirq_count being a mask that can hold NR_IRQS. Some archs allow for over 16000 IRQs, and that would require a mask of 14 bits. The sofitrq mask is 8 bits and the preempt disable mask is also 8 bits. The PREEMP_ACTIVE bit is bit 30, and bit 31 would make the preempt_count (which is type int) a negative number. A negative preempt_count is a sign of failure. Add them up 14+8+8+1+1 you get 32 bits. No room for the NMI bit. But the hardirq_count is to track the number of nested IRQs, not the number of total IRQs. This originally took the paranoid approach of setting the max nesting to NR_IRQS. But when we have archs with over 1000 IRQs, it is not practical to think they will ever all nest on a single CPU. Not to mention that this would most definitely cause a stack overflow. This patch sets a max of 10 bits to be used for IRQ nesting. I did a 'git grep HARDIRQ' to examine all users of HARDIRQ_BITS and HARDIRQ_MASK, and found that making it a max of 10 would not hurt anyone. I did find that the m68k expected it to be 8 bits, so I allow for the archs to set the number to be less than 10. I removed the setting of HARDIRQ_BITS from the archs that set it to more than 10. This includes ALPHA, ia64 and avr32. This will always allow room for the NMI bit, and if we need to allow for NMI nesting, we have 4 bits to play with. Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/hardirq.h | 13 ----------- arch/avr32/include/asm/hardirq.h | 11 --------- arch/ia64/include/asm/hardirq.h | 10 --------- include/linux/hardirq.h | 48 ++++++++++++++++++++-------------------- 4 files changed, 24 insertions(+), 58 deletions(-) diff --git a/arch/alpha/include/asm/hardirq.h b/arch/alpha/include/asm/hardirq.h index d953e234daa8..88971460fa6c 100644 --- a/arch/alpha/include/asm/hardirq.h +++ b/arch/alpha/include/asm/hardirq.h @@ -14,17 +14,4 @@ typedef struct { void ack_bad_irq(unsigned int irq); -#define HARDIRQ_BITS 12 - -/* - * The hardirq mask has to be large enough to have - * space for potentially nestable IRQ sources in the system - * to nest on a single CPU. On Alpha, interrupts are masked at the CPU - * by IPL as well as at the system level. We only have 8 IPLs (UNIX PALcode) - * so we really only have 8 nestable IRQs, but allow some overhead - */ -#if (1 << HARDIRQ_BITS) < 16 -#error HARDIRQ_BITS is too low! -#endif - #endif /* _ALPHA_HARDIRQ_H */ diff --git a/arch/avr32/include/asm/hardirq.h b/arch/avr32/include/asm/hardirq.h index 267354356f60..015bc75ea798 100644 --- a/arch/avr32/include/asm/hardirq.h +++ b/arch/avr32/include/asm/hardirq.h @@ -20,15 +20,4 @@ void ack_bad_irq(unsigned int irq); #endif /* __ASSEMBLY__ */ -#define HARDIRQ_BITS 12 - -/* - * The hardirq mask has to be large enough to have - * space for potentially all IRQ sources in the system - * nesting on a single CPU: - */ -#if (1 << HARDIRQ_BITS) < NR_IRQS -# error HARDIRQ_BITS is too low! -#endif - #endif /* __ASM_AVR32_HARDIRQ_H */ diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h index 140e495b8e0e..d514cd9edb49 100644 --- a/arch/ia64/include/asm/hardirq.h +++ b/arch/ia64/include/asm/hardirq.h @@ -20,16 +20,6 @@ #define local_softirq_pending() (local_cpu_data->softirq_pending) -#define HARDIRQ_BITS 14 - -/* - * The hardirq mask has to be large enough to have space for potentially all IRQ sources - * in the system nesting on a single CPU: - */ -#if (1 << HARDIRQ_BITS) < NR_IRQS -# error HARDIRQ_BITS is too low! -#endif - extern void __iomem *ipi_base_addr; void ack_bad_irq(unsigned int irq); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f3cf86e1465b..9841221f53f2 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -15,61 +15,61 @@ * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * - * The hardirq count can be overridden per architecture, the default is: + * The hardirq count can in theory reach the same as NR_IRQS. + * In reality, the number of nested IRQS is limited to the stack + * size as well. For archs with over 1000 IRQS it is not practical + * to expect that they will all nest. We give a max of 10 bits for + * hardirq nesting. An arch may choose to give less than 10 bits. + * m68k expects it to be 8. * - * - bits 16-27 are the hardirq count (max # of hardirqs: 4096) - * - ( bit 28 is the PREEMPT_ACTIVE flag. ) + * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024) + * - bit 26 is the NMI_MASK + * - bit 28 is the PREEMPT_ACTIVE flag * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x0fff0000 + * HARDIRQ_MASK: 0x03ff0000 + * NMI_MASK: 0x04000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 +#define NMI_BITS 1 -#ifndef HARDIRQ_BITS -#define HARDIRQ_BITS 12 +#define MAX_HARDIRQ_BITS 10 -#ifndef MAX_HARDIRQS_PER_CPU -#define MAX_HARDIRQS_PER_CPU NR_IRQS +#ifndef HARDIRQ_BITS +# define HARDIRQ_BITS MAX_HARDIRQ_BITS #endif -/* - * The hardirq mask has to be large enough to have space for potentially - * all IRQ sources in the system nesting on a single CPU. - */ -#if (1 << HARDIRQ_BITS) < MAX_HARDIRQS_PER_CPU -# error HARDIRQ_BITS is too low! -#endif +#if HARDIRQ_BITS > MAX_HARDIRQ_BITS +#error HARDIRQ_BITS too high! #endif #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) +#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) +#define NMI_OFFSET (1UL << NMI_SHIFT) -#if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS)) +#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) #error PREEMPT_ACTIVE is too low! #endif -#define NMI_OFFSET (PREEMPT_ACTIVE << 1) - -#if NMI_OFFSET >= 0x80000000 -#error PREEMPT_ACTIVE too high! -#endif - #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) -#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK)) +#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ + | NMI_MASK)) /* * Are we doing bottom half or hardware interrupt processing? @@ -82,7 +82,7 @@ /* * Are we in NMI context? */ -#define in_nmi() (preempt_count() & NMI_OFFSET) +#define in_nmi() (preempt_count() & NMI_MASK) #if defined(CONFIG_PREEMPT) # define PREEMPT_INATOMIC_BASE kernel_locked() -- cgit v1.2.3-59-g8ed1b From 45141d4667d208421ca787a3301542b6a5e0b112 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Feb 2009 13:19:48 -0500 Subject: ring-buffer: rename label out_unlock to out_reset Impact: clean up While reviewing the ring buffer code, I thougth I saw a bug with if (!__raw_spin_trylock(&cpu_buffer->lock)) goto out_unlock; But I forgot that we use a variable "lock_taken" that is set if the spinlock is taken, and only unlock it if that variable is set. To avoid further confusion from other reviewers, this patch renames the label out_unlock with out_reset, which is the more appropriate name. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dc18b5b9ccb4..f39d7e9a4305 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1017,7 +1017,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (unlikely(in_nmi())) { if (!__raw_spin_trylock(&cpu_buffer->lock)) - goto out_unlock; + goto out_reset; } else __raw_spin_lock(&cpu_buffer->lock); @@ -1030,7 +1030,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* we grabbed the lock before incrementing */ if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_unlock; + goto out_reset; /* * If for some reason, we had an interrupt storm that made @@ -1039,12 +1039,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (unlikely(next_page == commit_page)) { WARN_ON_ONCE(1); - goto out_unlock; + goto out_reset; } if (next_page == head_page) { if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_unlock; + goto out_reset; /* tail_page has not moved yet? */ if (tail_page == cpu_buffer->tail_page) { @@ -1118,7 +1118,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; - out_unlock: + out_reset: /* reset write */ if (tail <= BUF_PAGE_SIZE) local_set(&tail_page->write, tail); -- cgit v1.2.3-59-g8ed1b From 264307520b120593ba63c4a23c58dd5e2ec5e822 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Feb 2009 19:55:48 +0100 Subject: xfs: fix error handling in xfs_log_mount We can't just call xfs_log_unmount_dealloc on any failure because the ail thread which is torn down by xfs_log_unmount_dealloc might not be initialized yet. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Reported-by: Lachlan McIlroy --- fs/xfs/xfs_log.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 493c07f6a99a..c8f300897728 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -574,7 +574,7 @@ xfs_log_mount( error = xfs_trans_ail_init(mp); if (error) { cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); - goto error; + goto out_free_log; } mp->m_log->l_ailp = mp->m_ail; @@ -594,20 +594,22 @@ xfs_log_mount( mp->m_flags |= XFS_MOUNT_RDONLY; if (error) { cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); - goto error; + goto out_destroy_ail; } } /* Normal transactions can now occur */ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; - /* End mounting message in xfs_log_mount_finish */ return 0; -error: - xfs_log_unmount_dealloc(mp); + +out_destroy_ail: + xfs_trans_ail_destroy(mp); +out_free_log: + xlog_dealloc_log(mp->m_log); out: return error; -} /* xfs_log_mount */ +} /* * Finish the recovery of the file system. This is separate from -- cgit v1.2.3-59-g8ed1b From 7c8f7af67de19a7ae33a6fc06764771265b0cc56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 12 Feb 2009 19:56:00 +0100 Subject: xfs: reject swapext ioctl on swapfiles Swapfiles are magic - I/O is directly initialized by the VM without involving the filesystem. Swapping out extents underneath the VM thus can cause severe problems. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher --- fs/xfs/xfs_dfrag.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index ac96ab9f70a2..e6d839bddbf0 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -79,6 +79,12 @@ xfs_swapext( goto out_put_target_file; } + if (IS_SWAPFILE(file->f_path.dentry->d_inode) || + IS_SWAPFILE(target_file->f_path.dentry->d_inode)) { + error = XFS_ERROR(EINVAL); + goto out_put_target_file; + } + ip = XFS_I(file->f_path.dentry->d_inode); tip = XFS_I(target_file->f_path.dentry->d_inode); -- cgit v1.2.3-59-g8ed1b From 2a7b8df04c11a70105c1abe67d006455d3bdc944 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Feb 2009 14:16:46 -0500 Subject: sched: do not account for NMIs Impact: avoid corruption in system time accounting Martin Schwidefsky told me that there was an issue with NMIs and system accounting. The problem is that the accounting code is not reentrant, and if an NMI goes off after an interrupt it can corrupt the accounting. For now, the best we can do is to treat NMIs like SMIs and they are not accounted for. This patch changes nmi_enter to not call __irq_enter and to do the preempt-count and tracing calls directly. Signed-off-by: Steven Rostedt --- include/linux/hardirq.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 9841221f53f2..faa1cf848bcd 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -175,24 +175,24 @@ extern void irq_enter(void); */ extern void irq_exit(void); -#define nmi_enter() \ - do { \ - ftrace_nmi_enter(); \ - BUG_ON(in_nmi()); \ - add_preempt_count(NMI_OFFSET); \ - lockdep_off(); \ - rcu_nmi_enter(); \ - __irq_enter(); \ +#define nmi_enter() \ + do { \ + ftrace_nmi_enter(); \ + BUG_ON(in_nmi()); \ + add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + lockdep_off(); \ + rcu_nmi_enter(); \ + trace_hardirq_enter(); \ } while (0) -#define nmi_exit() \ - do { \ - __irq_exit(); \ - rcu_nmi_exit(); \ - lockdep_on(); \ - BUG_ON(!in_nmi()); \ - sub_preempt_count(NMI_OFFSET); \ - ftrace_nmi_exit(); \ +#define nmi_exit() \ + do { \ + trace_hardirq_exit(); \ + rcu_nmi_exit(); \ + lockdep_on(); \ + BUG_ON(!in_nmi()); \ + sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + ftrace_nmi_exit(); \ } while (0) #endif /* LINUX_HARDIRQ_H */ -- cgit v1.2.3-59-g8ed1b From b5f9fd0f8a05c9bafb91a9a85b9110938d8e585b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 11 Feb 2009 13:57:25 -0500 Subject: tracing: convert c/p state power tracer to use tracepoints Convert the c/p state "power" tracer to use tracepoints. Avoids a function call when the tracer is disabled. Signed-off-by: Jason Baron Acked-by: Ingo Molnar Signed-off-by: Steven Rostedt --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 + arch/x86/kernel/process.c | 3 + include/trace/power.h | 25 ++--- kernel/trace/trace_power.c | 173 +++++++++++++++++------------ 4 files changed, 119 insertions(+), 84 deletions(-) diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 7ed925edf4d2..c5d737cdb365 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -70,6 +70,8 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); +DEFINE_TRACE(power_mark); + /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 026819ffcb0c..e0d0fd7ab514 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -19,6 +19,9 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +DEFINE_TRACE(power_start); +DEFINE_TRACE(power_end); + int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; diff --git a/include/trace/power.h b/include/trace/power.h index c7cefbcdaea4..2c733e58e89c 100644 --- a/include/trace/power.h +++ b/include/trace/power.h @@ -2,6 +2,7 @@ #define _TRACE_POWER_H #include +#include enum { POWER_NONE = 0, @@ -18,18 +19,16 @@ struct power_trace { #endif }; -#ifdef CONFIG_POWER_TRACER -extern void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state); -extern void trace_power_end(struct power_trace *it); -#else -static inline void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int state) { } -static inline void trace_power_end(struct power_trace *it) { } -#endif +DECLARE_TRACE(power_start, + TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), + TPARGS(it, type, state)); + +DECLARE_TRACE(power_mark, + TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), + TPARGS(it, type, state)); + +DECLARE_TRACE(power_end, + TPPROTO(struct power_trace *it), + TPARGS(it)); #endif /* _TRACE_POWER_H */ diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index b1d0d087d3a6..91ce672fb037 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -21,15 +21,116 @@ static struct trace_array *power_trace; static int __read_mostly trace_power_enabled; +static void probe_power_start(struct power_trace *it, unsigned int type, + unsigned int level) +{ + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); +} + + +static void probe_power_end(struct power_trace *it) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + preempt_disable(); + it->end = ktime_get(); + data = tr->data[smp_processor_id()]; + + event = trace_buffer_lock_reserve(tr, TRACE_POWER, + sizeof(*entry), 0, 0); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->state_data = *it; + trace_buffer_unlock_commit(tr, event, 0, 0); + out: + preempt_enable(); +} + +static void probe_power_mark(struct power_trace *it, unsigned int type, + unsigned int level) +{ + struct ring_buffer_event *event; + struct trace_power *entry; + struct trace_array_cpu *data; + struct trace_array *tr = power_trace; + + if (!trace_power_enabled) + return; + + memset(it, 0, sizeof(struct power_trace)); + it->state = level; + it->type = type; + it->stamp = ktime_get(); + preempt_disable(); + it->end = it->stamp; + data = tr->data[smp_processor_id()]; + + event = trace_buffer_lock_reserve(tr, TRACE_POWER, + sizeof(*entry), 0, 0); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->state_data = *it; + trace_buffer_unlock_commit(tr, event, 0, 0); + out: + preempt_enable(); +} + +static int tracing_power_register(void) +{ + int ret; + + ret = register_trace_power_start(probe_power_start); + if (ret) { + pr_info("power trace: Couldn't activate tracepoint" + " probe to trace_power_start\n"); + return ret; + } + ret = register_trace_power_end(probe_power_end); + if (ret) { + pr_info("power trace: Couldn't activate tracepoint" + " probe to trace_power_end\n"); + goto fail_start; + } + ret = register_trace_power_mark(probe_power_mark); + if (ret) { + pr_info("power trace: Couldn't activate tracepoint" + " probe to trace_power_mark\n"); + goto fail_end; + } + return ret; +fail_end: + unregister_trace_power_end(probe_power_end); +fail_start: + unregister_trace_power_start(probe_power_start); + return ret; +} static void start_power_trace(struct trace_array *tr) { trace_power_enabled = 1; + tracing_power_register(); } static void stop_power_trace(struct trace_array *tr) { trace_power_enabled = 0; + unregister_trace_power_start(probe_power_start); + unregister_trace_power_end(probe_power_end); + unregister_trace_power_mark(probe_power_mark); } @@ -39,6 +140,7 @@ static int power_trace_init(struct trace_array *tr) power_trace = tr; trace_power_enabled = 1; + tracing_power_register(); for_each_cpu(cpu, cpu_possible_mask) tracing_reset(tr, cpu); @@ -95,74 +197,3 @@ static int init_power_trace(void) return register_tracer(&power_tracer); } device_initcall(init_power_trace); - -void trace_power_start(struct power_trace *it, unsigned int type, - unsigned int level) -{ - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); -} -EXPORT_SYMBOL_GPL(trace_power_start); - - -void trace_power_end(struct power_trace *it) -{ - struct ring_buffer_event *event; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - preempt_disable(); - it->end = ktime_get(); - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(tr, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - trace_buffer_unlock_commit(tr, event, 0, 0); - out: - preempt_enable(); -} -EXPORT_SYMBOL_GPL(trace_power_end); - -void trace_power_mark(struct power_trace *it, unsigned int type, - unsigned int level) -{ - struct ring_buffer_event *event; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); - preempt_disable(); - it->end = it->stamp; - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(tr, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - trace_buffer_unlock_commit(tr, event, 0, 0); - out: - preempt_enable(); -} -EXPORT_SYMBOL_GPL(trace_power_mark); -- cgit v1.2.3-59-g8ed1b From a234aa9ecdf47a5461573a21dc0b154278df5ba8 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sat, 14 Feb 2009 09:36:00 +0600 Subject: tracing: fix section mismatch in trace_hw_branches.c The function bts_trace_init() references a variable bts_hotcpu_notifier which is marked as __cpuinitdata. Thus causes section mismatch. This patch fixes it. LD kernel/trace/built-in.o WARNING: kernel/trace/built-in.o(.text+0xc90c): Section mismatch in reference from the function bts_trace_init() to the variable .cpuinit.data:bts_hotcpu_notifier The function bts_trace_init() references the variable __cpuinitdata bts_hotcpu_notifier. This is often because bts_trace_init lacks a __cpuinitdata annotation or the annotation of bts_hotcpu_notifier is wrong. WARNING: kernel/trace/built-in.o(.text+0xc92a): Section mismatch in reference from the function bts_trace_reset() to the variable .cpuinit.data:bts_hotcpu_notifier The function bts_trace_reset() references the variable __cpuinitdata bts_hotcpu_notifier. This is often because bts_trace_reset lacks a __cpuinitdata annotation or the annotation of bts_hotcpu_notifier is wrong. Signed-off-by: Rakib Mullick Cc: markus.t.metzger@gmail.com Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 0794dd33f27b..3561aace075c 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -127,7 +127,7 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { .notifier_call = bts_hotcpu_handler }; -static int bts_trace_init(struct trace_array *tr) +static int __cpuinit bts_trace_init(struct trace_array *tr) { hw_branch_trace = tr; @@ -137,7 +137,7 @@ static int bts_trace_init(struct trace_array *tr) return 0; } -static void bts_trace_reset(struct trace_array *tr) +static void __cpuinit bts_trace_reset(struct trace_array *tr) { bts_trace_stop(tr); unregister_hotcpu_notifier(&bts_hotcpu_notifier); -- cgit v1.2.3-59-g8ed1b From 0c75a3ed633419d75d823d5dcb05d42924c6ae61 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Feb 2009 11:21:52 -0500 Subject: ftrace: state that all functions are enabled in set_ftrace_filter Impact: clean up, make set_ftrace_filter less confusing The set_ftrace_filter shows only the functions that will be traced. But when it is empty, it will trace all functions. This can be a bit confusing. This patch makes set_ftrace_filter show: #### all functions enabled #### When all functions will be traced, and we do not filter only a select few. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1796e018fbff..369fb78bd4ab 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -773,6 +773,7 @@ enum { FTRACE_ITER_CONT = (1 << 1), FTRACE_ITER_NOTRACE = (1 << 2), FTRACE_ITER_FAILURES = (1 << 3), + FTRACE_ITER_PRINTALL = (1 << 4), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -794,6 +795,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; + if (iter->flags & FTRACE_ITER_PRINTALL) + return NULL; + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); retry: @@ -834,6 +838,19 @@ static void *t_start(struct seq_file *m, loff_t *pos) struct ftrace_iterator *iter = m->private; void *p = NULL; + /* + * For set_ftrace_filter reading, if we have the filter + * off, we can short cut and just print out that all + * functions are enabled. + */ + if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { + if (*pos > 0) + return NULL; + iter->flags |= FTRACE_ITER_PRINTALL; + (*pos)++; + return iter; + } + if (*pos > 0) { if (iter->idx < 0) return p; @@ -852,9 +869,15 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { + struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; + if (iter->flags & FTRACE_ITER_PRINTALL) { + seq_printf(m, "#### all functions enabled ####\n"); + return 0; + } + if (!rec) return 0; -- cgit v1.2.3-59-g8ed1b From 265c831cb03d533cbe159af45798ac9fef534260 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Feb 2009 12:43:56 -0500 Subject: ftrace: add do_for_each_ftrace_rec and while_for_each_ftrace_rec Impact: clean up To iterate over all the functions that dynamic trace knows about it requires two for loops. One to iterate over the pages and the other to iterate over the records within the page. There are several duplications of these loops in ftrace.c. This patch creates the macros do_for_each_ftrace_rec and while_for_each_ftrace_rec to handle this logic, and removes the duplicate code. While making this change, I also discovered and fixed a small bug that one of the iterations should exit the loop after it found the record it was searching for. This used a break when it should have used a goto, since there were two loops it needed to break out from. No real harm was done by this bug since it would only continue to search the other records, and the code was in a slow path anyway. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 208 ++++++++++++++++++++++++-------------------------- 1 file changed, 101 insertions(+), 107 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 369fb78bd4ab..fed1ebc3a133 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -297,6 +297,19 @@ static struct ftrace_page *ftrace_pages; static struct dyn_ftrace *ftrace_free_records; +/* + * This is a double for. Do not use 'break' to break out of the loop, + * you must use a goto. + */ +#define do_for_each_ftrace_rec(pg, rec) \ + for (pg = ftrace_pages_start; pg; pg = pg->next) { \ + int _____i; \ + for (_____i = 0; _____i < pg->index; _____i++) { \ + rec = &pg->records[_____i]; + +#define while_for_each_ftrace_rec() \ + } \ + } #ifdef CONFIG_KPROBES @@ -341,7 +354,6 @@ void ftrace_release(void *start, unsigned long size) struct ftrace_page *pg; unsigned long s = (unsigned long)start; unsigned long e = s + size; - int i; if (ftrace_disabled || !start) return; @@ -349,14 +361,11 @@ void ftrace_release(void *start, unsigned long size) /* should not be called from interrupt context */ spin_lock(&ftrace_lock); - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - rec = &pg->records[i]; + do_for_each_ftrace_rec(pg, rec) { + if ((rec->ip >= s) && (rec->ip < e)) + ftrace_free_rec(rec); + } while_for_each_ftrace_rec(); - if ((rec->ip >= s) && (rec->ip < e)) - ftrace_free_rec(rec); - } - } spin_unlock(&ftrace_lock); } @@ -523,41 +532,37 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) static void ftrace_replace_code(int enable) { - int i, failed; + int failed; struct dyn_ftrace *rec; struct ftrace_page *pg; - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - rec = &pg->records[i]; - - /* - * Skip over free records and records that have - * failed. - */ - if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED) - continue; - - /* ignore updates to this record's mcount site */ - if (get_kprobe((void *)rec->ip)) { - freeze_record(rec); - continue; - } else { - unfreeze_record(rec); - } + do_for_each_ftrace_rec(pg, rec) { + /* + * Skip over free records and records that have + * failed. + */ + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED) + continue; - failed = __ftrace_replace_code(rec, enable); - if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { - rec->flags |= FTRACE_FL_FAILED; - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(rec->ip)) { - ftrace_free_rec(rec); - } else - ftrace_bug(failed, rec->ip); - } + /* ignore updates to this record's mcount site */ + if (get_kprobe((void *)rec->ip)) { + freeze_record(rec); + continue; + } else { + unfreeze_record(rec); } - } + + failed = __ftrace_replace_code(rec, enable); + if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { + rec->flags |= FTRACE_FL_FAILED; + if ((system_state == SYSTEM_BOOTING) || + !core_kernel_text(rec->ip)) { + ftrace_free_rec(rec); + } else + ftrace_bug(failed, rec->ip); + } + } while_for_each_ftrace_rec(); } static int @@ -956,22 +961,17 @@ static void ftrace_filter_reset(int enable) struct ftrace_page *pg; struct dyn_ftrace *rec; unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - unsigned i; /* should not be called from interrupt context */ spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 0; - pg = ftrace_pages_start; - while (pg) { - for (i = 0; i < pg->index; i++) { - rec = &pg->records[i]; - if (rec->flags & FTRACE_FL_FAILED) - continue; - rec->flags &= ~type; - } - pg = pg->next; - } + do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_FAILED) + continue; + rec->flags &= ~type; + } while_for_each_ftrace_rec(); + spin_unlock(&ftrace_lock); } @@ -1094,44 +1094,39 @@ ftrace_match(unsigned char *buff, int len, int enable) spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 1; - pg = ftrace_pages_start; - while (pg) { - for (i = 0; i < pg->index; i++) { - int matched = 0; - char *ptr; - - rec = &pg->records[i]; - if (rec->flags & FTRACE_FL_FAILED) - continue; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - switch (type) { - case MATCH_FULL: - if (strcmp(str, buff) == 0) - matched = 1; - break; - case MATCH_FRONT_ONLY: - if (memcmp(str, buff, match) == 0) - matched = 1; - break; - case MATCH_MIDDLE_ONLY: - if (strstr(str, search)) - matched = 1; - break; - case MATCH_END_ONLY: - ptr = strstr(str, search); - if (ptr && (ptr[search_len] == 0)) - matched = 1; - break; - } - if (matched) { - if (not) - rec->flags &= ~flag; - else - rec->flags |= flag; - } + do_for_each_ftrace_rec(pg, rec) { + int matched = 0; + char *ptr; + + if (rec->flags & FTRACE_FL_FAILED) + continue; + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, buff) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (memcmp(str, buff, match) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, search)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, search); + if (ptr && (ptr[search_len] == 0)) + matched = 1; + break; } - pg = pg->next; - } + if (matched) { + if (not) + rec->flags &= ~flag; + else + rec->flags |= flag; + } + } while_for_each_ftrace_rec(); spin_unlock(&ftrace_lock); } @@ -1452,7 +1447,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) struct dyn_ftrace *rec; struct ftrace_page *pg; int found = 0; - int i, j; + int j; if (ftrace_disabled) return -ENODEV; @@ -1460,27 +1455,26 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) /* should not be called from interrupt context */ spin_lock(&ftrace_lock); - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - rec = &pg->records[i]; - - if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) - continue; - - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - if (strcmp(str, buffer) == 0) { - found = 1; - for (j = 0; j < idx; j++) - if (array[j] == rec->ip) { - found = 0; - break; - } - if (found) - array[idx] = rec->ip; - break; - } + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) + continue; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + if (strcmp(str, buffer) == 0) { + /* Return 1 if we add it to the array */ + found = 1; + for (j = 0; j < idx; j++) + if (array[j] == rec->ip) { + found = 0; + break; + } + if (found) + array[idx] = rec->ip; + goto out; } - } + } while_for_each_ftrace_rec(); + out: spin_unlock(&ftrace_lock); return found ? 0 : -EINVAL; -- cgit v1.2.3-59-g8ed1b From 7f24b31b01a271b62346d9df084b029e48612163 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Feb 2009 14:37:33 -0500 Subject: ftrace: rename ftrace_match to ftrace_match_records Impact: clean up ftrace_match is too generic of a name. What it really does is search all records and matches the records with the given string, and either sets or unsets the functions to be traced depending on if the parameter 'enable' is set or not. This allows us to make another function called ftrace_match that can be used to test a single record. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fed1ebc3a133..f397d7adb62e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1054,7 +1054,7 @@ enum { }; static void -ftrace_match(unsigned char *buff, int len, int enable) +ftrace_match_records(unsigned char *buff, int len, int enable) { char str[KSYM_SYMBOL_LEN]; char *search = NULL; @@ -1197,7 +1197,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (isspace(ch)) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx, enable); + ftrace_match_records(iter->buffer, iter->buffer_idx, enable); iter->buffer_idx = 0; } else iter->flags |= FTRACE_ITER_CONT; @@ -1236,7 +1236,7 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) if (reset) ftrace_filter_reset(enable); if (buf) - ftrace_match(buf, len, enable); + ftrace_match_records(buf, len, enable); mutex_unlock(&ftrace_regex_lock); } @@ -1286,7 +1286,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) if (iter->buffer_idx) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx, enable); + ftrace_match_records(iter->buffer, iter->buffer_idx, enable); } mutex_lock(&ftrace_sysctl_lock); -- cgit v1.2.3-59-g8ed1b From 9f4801e30ad291e27284e873696da1ead92d68fa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Feb 2009 15:56:43 -0500 Subject: ftrace: break up ftrace_match_records into smaller components Impact: clean up ftrace_match_records does a lot of things that other features can use. This patch breaks up ftrace_match_records and pulls out ftrace_setup_glob and ftrace_match_record. ftrace_setup_glob prepares a simple glob expression for use with ftrace_match_record. ftrace_match_record compares a single record with a glob type. Breaking this up will allow for more features to run on individual records. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 115 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 40 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f397d7adb62e..fcec31323a10 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1053,79 +1053,114 @@ enum { MATCH_END_ONLY, }; -static void -ftrace_match_records(unsigned char *buff, int len, int enable) +/* + * (static function - no need for kernel doc) + * + * Pass in a buffer containing a glob and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +static int +ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not) { - char str[KSYM_SYMBOL_LEN]; - char *search = NULL; - struct ftrace_page *pg; - struct dyn_ftrace *rec; int type = MATCH_FULL; - unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - unsigned i, match = 0, search_len = 0; - int not = 0; + int i; if (buff[0] == '!') { - not = 1; + *not = 1; buff++; len--; - } + } else + *not = 0; + + *search = buff; for (i = 0; i < len; i++) { if (buff[i] == '*') { if (!i) { - search = buff + i + 1; + *search = buff + 1; type = MATCH_END_ONLY; - search_len = len - (i + 1); } else { - if (type == MATCH_END_ONLY) { + if (type == MATCH_END_ONLY) type = MATCH_MIDDLE_ONLY; - } else { - match = i; + else type = MATCH_FRONT_ONLY; - } buff[i] = 0; break; } } } + return type; +} + +static int +ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) +{ + char str[KSYM_SYMBOL_LEN]; + int matched = 0; + char *ptr; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, regex) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (strncmp(str, regex, len) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, regex)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, regex); + if (ptr && (ptr[len] == 0)) + matched = 1; + break; + } + + return matched; +} + +static void ftrace_match_records(char *buff, int len, int enable) +{ + char *search; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type; + unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + unsigned search_len; + int not; + + type = ftrace_setup_glob(buff, len, &search, ¬); + + search_len = strlen(search); + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 1; do_for_each_ftrace_rec(pg, rec) { - int matched = 0; - char *ptr; if (rec->flags & FTRACE_FL_FAILED) continue; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - switch (type) { - case MATCH_FULL: - if (strcmp(str, buff) == 0) - matched = 1; - break; - case MATCH_FRONT_ONLY: - if (memcmp(str, buff, match) == 0) - matched = 1; - break; - case MATCH_MIDDLE_ONLY: - if (strstr(str, search)) - matched = 1; - break; - case MATCH_END_ONLY: - ptr = strstr(str, search); - if (ptr && (ptr[search_len] == 0)) - matched = 1; - break; - } - if (matched) { + + if (ftrace_match_record(rec, search, search_len, type)) { if (not) rec->flags &= ~flag; else rec->flags |= flag; } + } while_for_each_ftrace_rec(); spin_unlock(&ftrace_lock); } -- cgit v1.2.3-59-g8ed1b From 64e7c440618998fd69eee6ab490b042d12248021 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Feb 2009 17:08:48 -0500 Subject: ftrace: add module command function filter selection This patch adds a "command" syntax to the function filtering files: /debugfs/tracing/set_ftrace_filter /debugfs/tracing/set_ftrace_notrace Of the format: :: The command is optional, and dependent on the command, so are the parameters. echo do_fork > set_ftrace_filter Will only trace 'do_fork'. echo 'sched_*' > set_ftrace_filter Will only trace functions starting with the letters 'sched_'. echo '*:mod:ext3' > set_ftrace_filter Will trace only the ext3 module functions. echo '*write*:mod:ext3' > set_ftrace_notrace Will prevent the ext3 functions with the letters 'write' in the name from being traced. echo '!*_allocate:mod:ext3' > set_ftrace_filter Will remove the functions in ext3 that end with the letters '_allocate' from the ftrace filter. Although this patch implements the 'command' format, only the 'mod' command is supported. More commands to follow. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fcec31323a10..9e60ae423af9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1067,7 +1067,7 @@ enum { * 0 otherwise. */ static int -ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not) +ftrace_setup_glob(char *buff, int len, char **search, int *not) { int type = MATCH_FULL; int i; @@ -1100,14 +1100,11 @@ ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not) return type; } -static int -ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) +static int ftrace_match(char *str, char *regex, int len, int type) { - char str[KSYM_SYMBOL_LEN]; int matched = 0; char *ptr; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); switch (type) { case MATCH_FULL: if (strcmp(str, regex) == 0) @@ -1131,6 +1128,15 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) return matched; } +static int +ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) +{ + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + return ftrace_match(str, regex, len, type); +} + static void ftrace_match_records(char *buff, int len, int enable) { char *search; @@ -1165,6 +1171,100 @@ static void ftrace_match_records(char *buff, int len, int enable) spin_unlock(&ftrace_lock); } +static int +ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, + char *regex, int len, int type) +{ + char str[KSYM_SYMBOL_LEN]; + char *modname; + + kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + + if (!modname || strcmp(modname, mod)) + return 0; + + /* blank search means to match all funcs in the mod */ + if (len) + return ftrace_match(str, regex, len, type); + else + return 1; +} + +static void ftrace_match_module_records(char *buff, char *mod, int enable) +{ + char *search = buff; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + unsigned search_len = 0; + int not = 0; + + /* blank or '*' mean the same */ + if (strcmp(buff, "*") == 0) + buff[0] = 0; + + /* handle the case of 'dont filter this module' */ + if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { + buff[0] = 0; + not = 1; + } + + if (strlen(buff)) { + type = ftrace_setup_glob(buff, strlen(buff), &search, ¬); + search_len = strlen(search); + } + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + if (enable) + ftrace_filtered = 1; + + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_FAILED) + continue; + + if (ftrace_match_module_record(rec, mod, + search, search_len, type)) { + if (not) + rec->flags &= ~flag; + else + rec->flags |= flag; + } + + } while_for_each_ftrace_rec(); + spin_unlock(&ftrace_lock); +} + +static int ftrace_process_regex(char *buff, int len, int enable) +{ + char *func, *mod, *command, *next = buff; + + func = strsep(&next, ":"); + + if (!next) { + ftrace_match_records(func, len, enable); + return 0; + } + + /* command fonud */ + + command = strsep(&next, ":"); + + if (strcmp(command, "mod") == 0) { + /* only match modules */ + if (!next) + return -EINVAL; + + mod = strsep(&next, ":"); + ftrace_match_module_records(func, mod, enable); + return 0; + } + + return -EINVAL; +} + static ssize_t ftrace_regex_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos, int enable) @@ -1232,7 +1332,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (isspace(ch)) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match_records(iter->buffer, iter->buffer_idx, enable); + ret = ftrace_process_regex(iter->buffer, + iter->buffer_idx, enable); + if (ret) + goto out; iter->buffer_idx = 0; } else iter->flags |= FTRACE_ITER_CONT; -- cgit v1.2.3-59-g8ed1b From e68746a271eb3393a2183840be9e903caddf765b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Feb 2009 20:53:42 -0500 Subject: ftrace: enable filtering only when a function is filtered on Impact: fix to prevent empty set_ftrace_filter and no ftrace output The function filter is used to only trace a given set of functions. The filter is enabled when a function name is echoed into the set_ftrace_filter file. But if the name has a typo and the function is not found, the filter is enabled, but no function is listed. This makes a confusing situation where set_ftrace_filter is empty but no functions ever get enabled for tracing. For example: # cat /debug/tracing/set_ftrace_filter #### all functions enabled #### # echo bad_name > set_ftrace_filter # cat /debug/tracing/set_ftrace_filter # echo function > current_tracer # cat trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | This patch changes that to only enable filtering if a function is set to be filtered on. Now, the filter is not enabled if a bad name is echoed into set_ftrace_filter. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e60ae423af9..340f88b68d9e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1153,8 +1153,6 @@ static void ftrace_match_records(char *buff, int len, int enable) /* should not be called from interrupt context */ spin_lock(&ftrace_lock); - if (enable) - ftrace_filtered = 1; do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_FAILED) @@ -1166,7 +1164,12 @@ static void ftrace_match_records(char *buff, int len, int enable) else rec->flags |= flag; } - + /* + * Only enable filtering if we have a function that + * is filtered on. + */ + if (enable && (rec->flags & FTRACE_FL_FILTER)) + ftrace_filtered = 1; } while_for_each_ftrace_rec(); spin_unlock(&ftrace_lock); } @@ -1217,9 +1220,6 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) /* should not be called from interrupt context */ spin_lock(&ftrace_lock); - if (enable) - ftrace_filtered = 1; - do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_FAILED) @@ -1232,6 +1232,8 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) else rec->flags |= flag; } + if (enable && (rec->flags & FTRACE_FL_FILTER)) + ftrace_filtered = 1; } while_for_each_ftrace_rec(); spin_unlock(&ftrace_lock); -- cgit v1.2.3-59-g8ed1b From f6180773d90595650e11de0118bb112018290915 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 00:40:25 -0500 Subject: ftrace: add command interface for function selection Allow for other tracers to add their own commands for function selection. This interface gives a trace the ability to name a command for function selection. Right now it is pretty limited in what it offers, but this is a building step for more features. The :mod: command is converted to this interface and also serves as a template for other implementations. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 16 ++++++++ kernel/trace/ftrace.c | 106 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 111 insertions(+), 11 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 106b7909d500..f0a0ecc63b5c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -95,6 +95,13 @@ stack_trace_sysctl(struct ctl_table *table, int write, loff_t *ppos); #endif +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(char *func, char *cmd, + char *params, int enable); +}; + #ifdef CONFIG_DYNAMIC_FTRACE /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include @@ -119,6 +126,9 @@ struct dyn_ftrace { int ftrace_force_update(void); void ftrace_set_filter(unsigned char *buf, int len, int reset); +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void *data); @@ -202,6 +212,12 @@ extern void ftrace_enable_daemon(void); # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) static inline void ftrace_release(void *start, unsigned long size) { } +static inline int register_ftrace_command(struct ftrace_func_command *cmd) +{ +} +static inline int unregister_ftrace_command(char *cmd_name) +{ +} #endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 340f88b68d9e..45a44c402566 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1239,9 +1239,93 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) spin_unlock(&ftrace_lock); } +/* + * We register the module command as a template to show others how + * to register the a command as well. + */ + +static int +ftrace_mod_callback(char *func, char *cmd, char *param, int enable) +{ + char *mod; + + /* + * cmd == 'mod' because we only registered this func + * for the 'mod' ftrace_func_command. + * But if you register one func with multiple commands, + * you can tell which command was used by the cmd + * parameter. + */ + + /* we must have a module name */ + if (!param) + return -EINVAL; + + mod = strsep(¶m, ":"); + if (!strlen(mod)) + return -EINVAL; + + ftrace_match_module_records(func, mod, enable); + return 0; +} + +static struct ftrace_func_command ftrace_mod_cmd = { + .name = "mod", + .func = ftrace_mod_callback, +}; + +static int __init ftrace_mod_cmd_init(void) +{ + return register_ftrace_command(&ftrace_mod_cmd); +} +device_initcall(ftrace_mod_cmd_init); + +static LIST_HEAD(ftrace_commands); +static DEFINE_MUTEX(ftrace_cmd_mutex); + +int register_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p; + int ret = 0; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &ftrace_commands); + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +int unregister_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + static int ftrace_process_regex(char *buff, int len, int enable) { - char *func, *mod, *command, *next = buff; + struct ftrace_func_command *p; + char *func, *command, *next = buff; + int ret = -EINVAL; func = strsep(&next, ":"); @@ -1250,21 +1334,21 @@ static int ftrace_process_regex(char *buff, int len, int enable) return 0; } - /* command fonud */ + /* command found */ command = strsep(&next, ":"); - if (strcmp(command, "mod") == 0) { - /* only match modules */ - if (!next) - return -EINVAL; - - mod = strsep(&next, ":"); - ftrace_match_module_records(func, mod, enable); - return 0; + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(func, command, next, enable); + goto out_unlock; + } } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); - return -EINVAL; + return ret; } static ssize_t -- cgit v1.2.3-59-g8ed1b From 52baf11922db7377b580dd5448a07f71c6a35611 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 01:15:39 -0500 Subject: ftrace: convert ftrace_lock from a spinlock to mutex Impact: clean up The older versions of ftrace required doing the ftrace list search under atomic context. Now all the calls are in non-atomic context. There is no reason to keep the ftrace_lock as a spinlock. This patch converts it to a mutex. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 45a44c402566..4771732037ee 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -61,7 +61,7 @@ int function_trace_stop; */ static int ftrace_disabled __read_mostly; -static DEFINE_SPINLOCK(ftrace_lock); +static DEFINE_MUTEX(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); static DEFINE_MUTEX(ftrace_start_lock); @@ -134,8 +134,7 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) static int __register_ftrace_function(struct ftrace_ops *ops) { - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); ops->next = ftrace_list; /* @@ -172,7 +171,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) #endif } - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); return 0; } @@ -182,8 +181,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) struct ftrace_ops **p; int ret = 0; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); /* * If we are removing the last function, then simply point @@ -224,7 +222,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) } out: - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); return ret; } @@ -233,8 +231,7 @@ static void ftrace_update_pid_func(void) { ftrace_func_t func; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); if (ftrace_trace_function == ftrace_stub) goto out; @@ -256,7 +253,7 @@ static void ftrace_update_pid_func(void) #endif out: - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); } #ifdef CONFIG_DYNAMIC_FTRACE @@ -358,15 +355,12 @@ void ftrace_release(void *start, unsigned long size) if (ftrace_disabled || !start) return; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); - + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e)) ftrace_free_rec(rec); } while_for_each_ftrace_rec(); - - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); } static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) @@ -803,8 +797,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (iter->flags & FTRACE_ITER_PRINTALL) return NULL; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); retry: if (iter->idx >= iter->pg->index) { if (iter->pg->next) { @@ -833,7 +826,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) goto retry; } } - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); return rec; } @@ -962,8 +955,7 @@ static void ftrace_filter_reset(int enable) struct dyn_ftrace *rec; unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); if (enable) ftrace_filtered = 0; do_for_each_ftrace_rec(pg, rec) { @@ -971,8 +963,7 @@ static void ftrace_filter_reset(int enable) continue; rec->flags &= ~type; } while_for_each_ftrace_rec(); - - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); } static int @@ -1151,8 +1142,7 @@ static void ftrace_match_records(char *buff, int len, int enable) search_len = strlen(search); - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_FAILED) @@ -1171,7 +1161,7 @@ static void ftrace_match_records(char *buff, int len, int enable) if (enable && (rec->flags & FTRACE_FL_FILTER)) ftrace_filtered = 1; } while_for_each_ftrace_rec(); - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); } static int @@ -1218,8 +1208,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) search_len = strlen(search); } - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_FAILED) @@ -1236,7 +1225,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable) ftrace_filtered = 1; } while_for_each_ftrace_rec(); - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); } /* @@ -1676,9 +1665,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) if (ftrace_disabled) return -ENODEV; - /* should not be called from interrupt context */ - spin_lock(&ftrace_lock); - + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) @@ -1699,7 +1686,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer) } } while_for_each_ftrace_rec(); out: - spin_unlock(&ftrace_lock); + mutex_unlock(&ftrace_lock); return found ? 0 : -EINVAL; } -- cgit v1.2.3-59-g8ed1b From e6ea44e9b4c12325337cd1c06103cd515a1c02b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 01:42:44 -0500 Subject: ftrace: consolidate mutexes Impact: clean up Now that ftrace_lock is a mutex, there is no reason to have three different mutexes protecting similar data. All the mutex paths are not in hot paths, so having a mutex to cover more data is not a problem. This patch removes the ftrace_sysctl_lock and ftrace_start_lock and uses the ftrace_lock to protect the locations that were protected by these locks. By doing so, this change also removes some of the lock nesting that was taking place. There are still more mutexes in ftrace.c that can probably be consolidated, but they can be dealt with later. We need to be careful about the way the locks are nested, and by consolidating, we can cause a recursive deadlock. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 68 ++++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 47 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4771732037ee..157d4f68b0e0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,8 +62,6 @@ int function_trace_stop; static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static DEFINE_MUTEX(ftrace_sysctl_lock); -static DEFINE_MUTEX(ftrace_start_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { @@ -134,8 +132,6 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) static int __register_ftrace_function(struct ftrace_ops *ops) { - mutex_lock(&ftrace_lock); - ops->next = ftrace_list; /* * We are entering ops into the ftrace_list but another @@ -171,17 +167,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops) #endif } - mutex_unlock(&ftrace_lock); - return 0; } static int __unregister_ftrace_function(struct ftrace_ops *ops) { struct ftrace_ops **p; - int ret = 0; - - mutex_lock(&ftrace_lock); /* * If we are removing the last function, then simply point @@ -190,17 +181,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_list == ops && ops->next == &ftrace_list_end) { ftrace_trace_function = ftrace_stub; ftrace_list = &ftrace_list_end; - goto out; + return 0; } for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) if (*p == ops) break; - if (*p != ops) { - ret = -1; - goto out; - } + if (*p != ops) + return -1; *p = (*p)->next; @@ -221,10 +210,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) } } - out: - mutex_unlock(&ftrace_lock); - - return ret; + return 0; } static void ftrace_update_pid_func(void) @@ -622,13 +608,10 @@ static void ftrace_startup(int command) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftrace_start_lock); ftrace_start_up++; command |= FTRACE_ENABLE_CALLS; ftrace_startup_enable(command); - - mutex_unlock(&ftrace_start_lock); } static void ftrace_shutdown(int command) @@ -636,7 +619,6 @@ static void ftrace_shutdown(int command) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftrace_start_lock); ftrace_start_up--; if (!ftrace_start_up) command |= FTRACE_DISABLE_CALLS; @@ -647,11 +629,9 @@ static void ftrace_shutdown(int command) } if (!command || !ftrace_enabled) - goto out; + return; ftrace_run_update_code(command); - out: - mutex_unlock(&ftrace_start_lock); } static void ftrace_startup_sysctl(void) @@ -661,7 +641,6 @@ static void ftrace_startup_sysctl(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftrace_start_lock); /* Force update next time */ saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ @@ -669,7 +648,6 @@ static void ftrace_startup_sysctl(void) command |= FTRACE_ENABLE_CALLS; ftrace_run_update_code(command); - mutex_unlock(&ftrace_start_lock); } static void ftrace_shutdown_sysctl(void) @@ -679,13 +657,11 @@ static void ftrace_shutdown_sysctl(void) if (unlikely(ftrace_disabled)) return; - mutex_lock(&ftrace_start_lock); /* ftrace_start_up is true if ftrace is running */ if (ftrace_start_up) command |= FTRACE_DISABLE_CALLS; ftrace_run_update_code(command); - mutex_unlock(&ftrace_start_lock); } static cycle_t ftrace_update_time; @@ -1502,12 +1478,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) ftrace_match_records(iter->buffer, iter->buffer_idx, enable); } - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftrace_start_lock); + mutex_lock(&ftrace_lock); if (ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); - mutex_unlock(&ftrace_start_lock); - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); kfree(iter); mutex_unlock(&ftrace_regex_lock); @@ -1824,7 +1798,7 @@ static int ftrace_convert_nops(struct module *mod, unsigned long addr; unsigned long flags; - mutex_lock(&ftrace_start_lock); + mutex_lock(&ftrace_lock); p = start; while (p < end) { addr = ftrace_call_adjust(*p++); @@ -1843,7 +1817,7 @@ static int ftrace_convert_nops(struct module *mod, local_irq_save(flags); ftrace_update_code(mod); local_irq_restore(flags); - mutex_unlock(&ftrace_start_lock); + mutex_unlock(&ftrace_lock); return 0; } @@ -2016,7 +1990,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - mutex_lock(&ftrace_start_lock); + mutex_lock(&ftrace_lock); if (val < 0) { /* disable pid tracing */ if (!ftrace_pid_trace) @@ -2055,7 +2029,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, ftrace_startup_enable(0); out: - mutex_unlock(&ftrace_start_lock); + mutex_unlock(&ftrace_lock); return cnt; } @@ -2118,12 +2092,12 @@ int register_ftrace_function(struct ftrace_ops *ops) if (unlikely(ftrace_disabled)) return -1; - mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftrace_lock); ret = __register_ftrace_function(ops); ftrace_startup(0); - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); return ret; } @@ -2137,10 +2111,10 @@ int unregister_ftrace_function(struct ftrace_ops *ops) { int ret; - mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftrace_lock); ret = __unregister_ftrace_function(ops); ftrace_shutdown(0); - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); return ret; } @@ -2155,7 +2129,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (unlikely(ftrace_disabled)) return -ENODEV; - mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftrace_lock); ret = proc_dointvec(table, write, file, buffer, lenp, ppos); @@ -2184,7 +2158,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, } out: - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); return ret; } @@ -2296,7 +2270,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, { int ret = 0; - mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftrace_lock); ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); @@ -2314,13 +2288,13 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_startup(FTRACE_START_FUNC_RET); out: - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); return ret; } void unregister_ftrace_graph(void) { - mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftrace_lock); atomic_dec(&ftrace_graph_active); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; @@ -2328,7 +2302,7 @@ void unregister_ftrace_graph(void) ftrace_shutdown(FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); - mutex_unlock(&ftrace_sysctl_lock); + mutex_unlock(&ftrace_lock); } /* Allocate a return stack for newly created task */ -- cgit v1.2.3-59-g8ed1b From 59df055f1991c9fc0c71a9230663c39188f6972f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 15:29:06 -0500 Subject: ftrace: trace different functions with a different tracer Impact: new feature Currently, the function tracer only gives you an ability to hook a tracer to all functions being traced. The dynamic function trace allows you to pick and choose which of those functions will be traced, but all functions being traced will call all tracers that registered with the function tracer. This patch adds a new feature that allows a tracer to hook to specific functions, even when all functions are being traced. It allows for different functions to call different tracer hooks. The way this is accomplished is by a special function that will hook to the function tracer and will set up a hash table knowing which tracer hook to call with which function. This is the most general and easiest method to accomplish this. Later, an arch may choose to supply their own method in changing the mcount call of a function to call a different tracer. But that will be an exercise for the future. To register a function: struct ftrace_hook_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); int (*callback)(unsigned long ip, void **data); void (*free)(void **data); }; int register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, void *data); glob is a simple glob to search for the functions to hook. ops is a pointer to the operations (listed below) data is the default data to be passed to the hook functions when traced ops: func is the hook function to call when the functions are traced callback is a callback function that is called when setting up the hash. That is, if the tracer needs to do something special for each function, that is being traced, and wants to give each function its own data. The address of the entry data is passed to this callback, so that the callback may wish to update the entry to whatever it would like. free is a callback for when the entry is freed. In case the tracer allocated any data, it is give the chance to free it. To unregister we have three functions: void unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, void *data) This will unregister all hooks that match glob, point to ops, and have its data matching data. (note, if glob is NULL, blank or '*', all functions will be tested). void unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) This will unregister all functions matching glob that has an entry pointing to ops. void unregister_ftrace_function_hook_all(char *glob) This simply unregisters all funcs. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 18 ++++ kernel/trace/ftrace.c | 247 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 265 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f0a0ecc63b5c..13918c4400ad 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,6 +106,24 @@ struct ftrace_func_command { /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +struct ftrace_hook_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + void **data); + int (*callback)(unsigned long ip, void **data); + void (*free)(void **data); +}; + +extern int +register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data); +extern void +unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data); +extern void +unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops); +extern void unregister_ftrace_function_hook_all(char *glob); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 157d4f68b0e0..0b80e325f296 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -1245,6 +1246,252 @@ static int __init ftrace_mod_cmd_init(void) } device_initcall(ftrace_mod_cmd_init); +#define FTRACE_HASH_BITS 7 +#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + +struct ftrace_func_hook { + struct hlist_node node; + struct ftrace_hook_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct rcu_head rcu; +}; + +static void +function_trace_hook_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_func_hook *entry; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long key; + int resched; + + key = hash_long(ip, FTRACE_HASH_BITS); + + hhd = &ftrace_func_hash[key]; + + if (hlist_empty(hhd)) + return; + + /* + * Disable preemption for these calls to prevent a RCU grace + * period. This syncs the hash iteration and freeing of items + * on the hash. rcu_read_lock is too dangerous here. + */ + resched = ftrace_preempt_disable(); + hlist_for_each_entry_rcu(entry, n, hhd, node) { + if (entry->ip == ip) + entry->ops->func(ip, parent_ip, &entry->data); + } + ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_hook_ops __read_mostly = +{ + .func = function_trace_hook_call, +}; + +static int ftrace_hook_registered; + +static void __enable_ftrace_function_hook(void) +{ + int i; + + if (ftrace_hook_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + break; + } + /* Nothing registered? */ + if (i == FTRACE_FUNC_HASHSIZE) + return; + + __register_ftrace_function(&trace_hook_ops); + ftrace_startup(0); + ftrace_hook_registered = 1; +} + +static void __disable_ftrace_function_hook(void) +{ + int i; + + if (!ftrace_hook_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + return; + } + + /* no more funcs left */ + __unregister_ftrace_function(&trace_hook_ops); + ftrace_shutdown(0); + ftrace_hook_registered = 0; +} + + +static void ftrace_free_entry_rcu(struct rcu_head *rhp) +{ + struct ftrace_func_hook *entry = + container_of(rhp, struct ftrace_func_hook, rcu); + + if (entry->ops->free) + entry->ops->free(&entry->data); + kfree(entry); +} + + +int +register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data) +{ + struct ftrace_func_hook *entry; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned long key; + int type, len, not; + int count = 0; + char *search; + + type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + len = strlen(search); + + /* we do not support '!' for function hooks */ + if (WARN_ON(not)) + return -EINVAL; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_FAILED) + continue; + + if (!ftrace_match_record(rec, search, len, type)) + continue; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + /* If we did not hook to any, then return error */ + if (!count) + count = -ENOMEM; + goto out_unlock; + } + + count++; + + entry->data = data; + + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. + */ + if (ops->callback) { + if (ops->callback(rec->ip, &entry->data) < 0) { + /* caller does not like this func */ + kfree(entry); + continue; + } + } + + entry->ops = ops; + entry->ip = rec->ip; + + key = hash_long(entry->ip, FTRACE_HASH_BITS); + hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); + + } while_for_each_ftrace_rec(); + __enable_ftrace_function_hook(); + + out_unlock: + mutex_unlock(&ftrace_lock); + + return count; +} + +enum { + HOOK_TEST_FUNC = 1, + HOOK_TEST_DATA = 2 +}; + +static void +__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data, int flags) +{ + struct ftrace_func_hook *entry; + struct hlist_node *n, *tmp; + char str[KSYM_SYMBOL_LEN]; + int type = MATCH_FULL; + int i, len = 0; + char *search; + + if (glob && (strcmp(glob, "*") || !strlen(glob))) + glob = NULL; + else { + int not; + + type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); + len = strlen(search); + + /* we do not support '!' for function hooks */ + if (WARN_ON(not)) + return; + } + + mutex_lock(&ftrace_lock); + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + + hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { + + /* break up if statements for readability */ + if ((flags & HOOK_TEST_FUNC) && entry->ops != ops) + continue; + + if ((flags & HOOK_TEST_DATA) && entry->data != data) + continue; + + /* do this last, since it is the most expensive */ + if (glob) { + kallsyms_lookup(entry->ip, NULL, NULL, + NULL, str); + if (!ftrace_match(str, glob, len, type)) + continue; + } + + hlist_del(&entry->node); + call_rcu(&entry->rcu, ftrace_free_entry_rcu); + } + } + __disable_ftrace_function_hook(); + mutex_unlock(&ftrace_lock); +} + +void +unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, + void *data) +{ + __unregister_ftrace_function_hook(glob, ops, data, + HOOK_TEST_FUNC | HOOK_TEST_DATA); +} + +void +unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) +{ + __unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC); +} + +void unregister_ftrace_function_hook_all(char *glob) +{ + __unregister_ftrace_function_hook(glob, NULL, NULL, 0); +} + static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); -- cgit v1.2.3-59-g8ed1b From 988ae9d6b2bc3ebdc1a488490250a6812f85e9d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 19:17:02 -0500 Subject: ring-buffer: add tracing_is_on to test if ring buffer is enabled This patch adds the tracing_is_on() interface to tell if the ring buffer is turned on or not. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 8e6646a54acf..f5e793d69bd3 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -128,10 +128,12 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); void tracing_on(void); void tracing_off(void); void tracing_off_permanent(void); +int tracing_is_on(void); #else static inline void tracing_on(void) { } static inline void tracing_off(void) { } static inline void tracing_off_permanent(void) { } +static inline int tracing_is_on(void) { return 0; } #endif void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2b4626ce95d6..8f19f1aa42b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -98,6 +98,15 @@ void tracing_off_permanent(void) set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + return ring_buffer_flags == RB_BUFFERS_ON; +} +EXPORT_SYMBOL_GPL(tracing_is_on); + #include "trace.h" /* Up this if you want to test the TIME_EXTENTS and normalization */ -- cgit v1.2.3-59-g8ed1b From 23b4ff3aa479c9e3bb23cb6b2d0a97878399784a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 14 Feb 2009 19:04:24 -0500 Subject: ftrace: add traceon traceoff commands to enable/disable the buffers This patch adds the new function selection commands traceon and traceoff. traceon sets the function to enable the ring buffers while traceoff disables the ring buffers. You can pass in the number of times you want the command to be executed when the function is hit. It will only execute if the state of the buffers are not already in that state. Example: # echo do_fork:traceon:4 Will enable the ring buffers if they are disabled every time it hits do_fork, up to 4 times. # echo sys_close:traceoff This will disable the ring buffers every time (unlimited) when sys_close is called. Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 135 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 36bf9568ccd9..5c95708b9dc3 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -9,6 +9,7 @@ * Copyright (C) 2004-2006 Ingo Molnar * Copyright (C) 2004 William Lee Irwin III */ +#include #include #include #include @@ -231,9 +232,143 @@ static struct tracer function_trace __read_mostly = #endif }; +#ifdef CONFIG_DYNAMIC_FTRACE +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ + long *count = (long *)data; + + if (tracing_is_on()) + return; + + if (!*count) + return; + + if (*count != -1) + (*count)--; + + tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ + long *count = (long *)data; + + if (!tracing_is_on()) + return; + + if (!*count) + return; + + if (*count != -1) + (*count)--; + + tracing_off(); +} + +static struct ftrace_hook_ops traceon_hook_ops = { + .func = ftrace_traceon, +}; + +static struct ftrace_hook_ops traceoff_hook_ops = { + .func = ftrace_traceoff, +}; + +static int +ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) +{ + struct ftrace_hook_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = &traceon_hook_ops; + else + ops = &traceoff_hook_ops; + + unregister_ftrace_function_hook_func(glob, ops); + + return 0; +} + +static int +ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_hook_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + if (glob[0] == '!') + return ftrace_trace_onoff_unreg(glob+1, cmd, param); + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = &traceon_hook_ops; + else + ops = &traceoff_hook_ops; + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = strict_strtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = register_ftrace_function_hook(glob, ops, count); + + return ret; +} + +static struct ftrace_func_command ftrace_traceon_cmd = { + .name = "traceon", + .func = ftrace_trace_onoff_callback, +}; + +static struct ftrace_func_command ftrace_traceoff_cmd = { + .name = "traceoff", + .func = ftrace_trace_onoff_callback, +}; + +static int __init init_func_cmd_traceon(void) +{ + int ret; + + ret = register_ftrace_command(&ftrace_traceoff_cmd); + if (ret) + return ret; + + ret = register_ftrace_command(&ftrace_traceon_cmd); + if (ret) + unregister_ftrace_command(&ftrace_traceoff_cmd); + return ret; +} +#else +static inline int init_func_cmd_traceon(void) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + static __init int init_function_trace(void) { + init_func_cmd_traceon(); return register_tracer(&function_trace); } device_initcall(init_function_trace); + -- cgit v1.2.3-59-g8ed1b From 8fc0c701c5b6c0c3e242758c3acef6f9047940a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Feb 2009 15:28:00 -0500 Subject: ftrace: show selected functions in set_ftrace_filter This patch adds output to show what functions have tracer hooks attached to them. # echo 'sys_open:traceon:4' > /debug/tracing/set_ftrace_filter # cat set_ftrace_filter #### all functions enabled #### sys_open:ftrace_traceon:0000000000000004 # echo 'do_fork:traceoff:' > set_ftrace_filter # cat set_ftrace_filter #### all functions enabled #### sys_open:ftrace_traceon:0000000000000002 do_fork:ftrace_traceoff:ffffffffffffffff Note the 4 changed to a 2. This is because The code was executed twice since the traceoff was added. If a cat is done again: #### all functions enabled #### sys_open:ftrace_traceon do_fork:ftrace_traceoff:ffffffffffffffff The number disappears. That is because it will not print a NULL. Callbacks to allow the tracer to pretty print will be implemented soon. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 123 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 103 insertions(+), 20 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0b80e325f296..1e058848cddb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -45,14 +45,14 @@ ftrace_kill(); \ } while (0) +/* hash bits for specific function selection */ +#define FTRACE_HASH_BITS 7 +#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; -static struct pid * const ftrace_swapper_pid = &init_struct_pid; - /* Quick disabling of function tracer. */ int function_trace_stop; @@ -248,6 +248,21 @@ static void ftrace_update_pid_func(void) # error Dynamic ftrace depends on MCOUNT_RECORD #endif +/* set when tracing only a pid */ +struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + +struct ftrace_func_hook { + struct hlist_node node; + struct ftrace_hook_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct rcu_head rcu; +}; + + enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -750,12 +765,14 @@ enum { FTRACE_ITER_NOTRACE = (1 << 2), FTRACE_ITER_FAILURES = (1 << 3), FTRACE_ITER_PRINTALL = (1 << 4), + FTRACE_ITER_HASH = (1 << 5), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { struct ftrace_page *pg; + int hidx; int idx; unsigned flags; unsigned char buffer[FTRACE_BUFF_MAX+1]; @@ -763,18 +780,87 @@ struct ftrace_iterator { unsigned filtered; }; +static void * +t_hash_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct hlist_node *hnd = v; + struct hlist_head *hhd; + + WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); + + (*pos)++; + + retry: + if (iter->hidx >= FTRACE_FUNC_HASHSIZE) + return NULL; + + hhd = &ftrace_func_hash[iter->hidx]; + + if (hlist_empty(hhd)) { + iter->hidx++; + hnd = NULL; + goto retry; + } + + if (!hnd) + hnd = hhd->first; + else { + hnd = hnd->next; + if (!hnd) { + iter->hidx++; + goto retry; + } + } + + return hnd; +} + +static void *t_hash_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + + iter->flags |= FTRACE_ITER_HASH; + + return t_hash_next(m, p, pos); +} + +static int t_hash_show(struct seq_file *m, void *v) +{ + struct ftrace_func_hook *rec; + struct hlist_node *hnd = v; + char str[KSYM_SYMBOL_LEN]; + + rec = hlist_entry(hnd, struct ftrace_func_hook, node); + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, "%s:", str); + + kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str); + seq_printf(m, "%s", str); + + if (rec->data) + seq_printf(m, ":%p", rec->data); + seq_putc(m, '\n'); + + return 0; +} + static void * t_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = NULL; + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_next(m, v, pos); + (*pos)++; if (iter->flags & FTRACE_ITER_PRINTALL) return NULL; - mutex_lock(&ftrace_lock); retry: if (iter->idx >= iter->pg->index) { if (iter->pg->next) { @@ -803,7 +889,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) goto retry; } } - mutex_unlock(&ftrace_lock); return rec; } @@ -813,6 +898,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) struct ftrace_iterator *iter = m->private; void *p = NULL; + mutex_lock(&ftrace_lock); /* * For set_ftrace_filter reading, if we have the filter * off, we can short cut and just print out that all @@ -820,12 +906,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) */ if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { if (*pos > 0) - return NULL; + return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; (*pos)++; return iter; } + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_start(m, pos); + if (*pos > 0) { if (iter->idx < 0) return p; @@ -835,11 +924,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) p = t_next(m, p, pos); + if (!p) + return t_hash_start(m, pos); + return p; } static void t_stop(struct seq_file *m, void *p) { + mutex_unlock(&ftrace_lock); } static int t_show(struct seq_file *m, void *v) @@ -848,6 +941,9 @@ static int t_show(struct seq_file *m, void *v) struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_show(m, v); + if (iter->flags & FTRACE_ITER_PRINTALL) { seq_printf(m, "#### all functions enabled ####\n"); return 0; @@ -1246,19 +1342,6 @@ static int __init ftrace_mod_cmd_init(void) } device_initcall(ftrace_mod_cmd_init); -#define FTRACE_HASH_BITS 7 -#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_hook { - struct hlist_node node; - struct ftrace_hook_ops *ops; - unsigned long flags; - unsigned long ip; - void *data; - struct rcu_head rcu; -}; - static void function_trace_hook_call(unsigned long ip, unsigned long parent_ip) { -- cgit v1.2.3-59-g8ed1b From 809dcf29ce4e1723709910878e050bd187617e0e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Feb 2009 23:06:01 -0500 Subject: ftrace: add pretty print to selected fuction traces This patch adds a call back for the tracers that have hooks to selected functions. This allows the tracer to show better output in the set_ftrace_filter file. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 6 ++++++ kernel/trace/ftrace.c | 3 +++ 2 files changed, 9 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 13918c4400ad..b331e216d8a1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,12 +106,18 @@ struct ftrace_func_command { /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +struct seq_file; + struct ftrace_hook_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); int (*callback)(unsigned long ip, void **data); void (*free)(void **data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_hook_ops *ops, + void *data); }; extern int diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e058848cddb..6533c1d20155 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -834,6 +834,9 @@ static int t_hash_show(struct seq_file *m, void *v) rec = hlist_entry(hnd, struct ftrace_func_hook, node); + if (rec->ops->print) + return rec->ops->print(m, rec->ip, rec->ops, rec->data); + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, "%s:", str); -- cgit v1.2.3-59-g8ed1b From e110e3d1eaa0f9628918be67ddd32e8ad65a2871 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Feb 2009 23:38:13 -0500 Subject: ftrace: add pretty print function for traceon and traceoff hooks This patch adds a pretty print version of traceon and traceoff output for set_ftrace_filter. # echo 'sys_open:traceon:4' > set_ftrace_filter # cat set_ftrace_filter #### all functions enabled #### sys_open:traceon:count=4 Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5c95708b9dc3..f520aa419dff 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -267,14 +267,42 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) tracing_off(); } +static int +ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_hook_ops *ops, void *data); + static struct ftrace_hook_ops traceon_hook_ops = { .func = ftrace_traceon, + .print = ftrace_trace_onoff_print, }; static struct ftrace_hook_ops traceoff_hook_ops = { .func = ftrace_traceoff, + .print = ftrace_trace_onoff_print, }; +static int +ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_hook_ops *ops, void *data) +{ + char str[KSYM_SYMBOL_LEN]; + long count = (long)data; + + kallsyms_lookup(ip, NULL, NULL, NULL, str); + seq_printf(m, "%s:", str); + + if (ops == &traceon_hook_ops) + seq_printf(m, "traceon"); + else + seq_printf(m, "traceoff"); + + if (count != -1) + seq_printf(m, ":count=%ld", count); + seq_putc(m, '\n'); + + return 0; +} + static int ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) { -- cgit v1.2.3-59-g8ed1b From ec32816f94a0baf90f5e73033dcdbc8679c7f91d Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Fri, 13 Feb 2009 09:13:11 +0100 Subject: UBIFS: list usage cleanup Trivial cleanup, list_del(); list_add{,_tail}() is equivalent to list_move{,_tail}(). Semantic patch for coccinelle can be found at www.cccmz.de/~snakebyte/list_move_tail.spatch Signed-off-by: Eric Sesterhenn Signed-off-by: Artem Bityutskiy --- fs/ubifs/log.c | 3 +-- fs/ubifs/shrinker.c | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 3e0aa7367556..1004261dc864 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c) bud->jhead, c->leb_size - bud->start, c->cmt_bud_bytes); rb_erase(p1, &c->buds); - list_del(&bud->list); /* * If the commit does not finish, the recovery will need * to replay the journal, in which case the old buds @@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c) * commit i.e. do not allow them to be garbage * collected. */ - list_add(&bud->list, &c->old_buds); + list_move(&bud->list, &c->old_buds); } } spin_unlock(&c->buds_lock); diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index e7bab52a1410..02feb59cefca 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention) * Move this one to the end of the list to provide some * fairness. */ - list_del(&c->infos_list); - list_add_tail(&c->infos_list, &ubifs_infos); + list_move_tail(&c->infos_list, &ubifs_infos); mutex_unlock(&c->umount_mutex); if (freed >= nr) break; @@ -263,8 +262,7 @@ static int kick_a_thread(void) } if (i == 1) { - list_del(&c->infos_list); - list_add_tail(&c->infos_list, &ubifs_infos); + list_move_tail(&c->infos_list, &ubifs_infos); spin_unlock(&ubifs_infos_lock); ubifs_request_bg_commit(c); -- cgit v1.2.3-59-g8ed1b From 97d0bb8dcd8c2812e1927cdb51d7b1f9c98352b5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 11:47:39 +0100 Subject: ftrace: fix !CONFIG_FTRACE [un_]register_ftrace_command() prototypes Impact: build fix Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b331e216d8a1..63281228ce3e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -238,9 +238,11 @@ extern void ftrace_enable_daemon(void); static inline void ftrace_release(void *start, unsigned long size) { } static inline int register_ftrace_command(struct ftrace_func_command *cmd) { + return -EINVAL; } static inline int unregister_ftrace_command(char *cmd_name) { + return -EINVAL; } #endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v1.2.3-59-g8ed1b From 73d3fd96e77745742f3750b7b19ee42204adc210 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Feb 2009 11:48:18 +0100 Subject: ftrace: fix !CONFIG_DYNAMIC_FTRACE ftrace_swapper_pid definition Impact: build fix Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6533c1d20155..4e6c87ecf1bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -243,14 +243,16 @@ static void ftrace_update_pid_func(void) mutex_unlock(&ftrace_lock); } +/* set when tracing only a pid */ +struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; + #ifdef CONFIG_DYNAMIC_FTRACE + #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD #endif -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; -static struct pid * const ftrace_swapper_pid = &init_struct_pid; static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; struct ftrace_func_hook { -- cgit v1.2.3-59-g8ed1b From 6a24a244cd3a02d5b290293c32fcf2c6e92b4235 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 11:20:26 -0500 Subject: ftrace: clean up coding style Ingo Molnar pointed out some coding style issues with the recent ftrace updates. This patch cleans them up. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 24 +++++++++++++----------- kernel/trace/trace_functions.c | 1 - 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4e6c87ecf1bf..af9d95c0e4de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -460,8 +460,8 @@ static void ftrace_bug(int failed, unsigned long ip) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { - unsigned long ip, fl; unsigned long ftrace_addr; + unsigned long ip, fl; ftrace_addr = (unsigned long)FTRACE_ADDR; @@ -530,9 +530,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) static void ftrace_replace_code(int enable) { - int failed; struct dyn_ftrace *rec; struct ftrace_page *pg; + int failed; do_for_each_ftrace_rec(pg, rec) { /* @@ -1208,14 +1208,15 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) static void ftrace_match_records(char *buff, int len, int enable) { - char *search; + unsigned int search_len; struct ftrace_page *pg; struct dyn_ftrace *rec; + unsigned long flag; + char *search; int type; - unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - unsigned search_len; int not; + flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; type = ftrace_setup_glob(buff, len, &search, ¬); search_len = strlen(search); @@ -1263,14 +1264,16 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, static void ftrace_match_module_records(char *buff, char *mod, int enable) { - char *search = buff; + unsigned search_len = 0; struct ftrace_page *pg; struct dyn_ftrace *rec; int type = MATCH_FULL; - unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; - unsigned search_len = 0; + char *search = buff; + unsigned long flag; int not = 0; + flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; + /* blank or '*' mean the same */ if (strcmp(buff, "*") == 0) buff[0] = 0; @@ -1442,8 +1445,8 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, struct ftrace_func_hook *entry; struct ftrace_page *pg; struct dyn_ftrace *rec; - unsigned long key; int type, len, not; + unsigned long key; int count = 0; char *search; @@ -1623,8 +1626,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) static int ftrace_process_regex(char *buff, int len, int enable) { - struct ftrace_func_command *p; char *func, *command, *next = buff; + struct ftrace_func_command *p; int ret = -EINVAL; func = strsep(&next, ":"); @@ -2392,7 +2395,6 @@ static __init int ftrace_init_debugfs(void) "'set_ftrace_pid' entry\n"); return 0; } - fs_initcall(ftrace_init_debugfs); /** diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index f520aa419dff..021a574c5988 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -397,6 +397,5 @@ static __init int init_function_trace(void) init_func_cmd_traceon(); return register_tracer(&function_trace); } - device_initcall(init_function_trace); -- cgit v1.2.3-59-g8ed1b From b6887d7916e44c1d8913084fb6aa5004d9473f1a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 12:32:04 -0500 Subject: ftrace: rename _hook to _probe Impact: clean up Ingo Molnar did not like the _hook naming convention used by the select function tracer. Luis Claudio R. Goncalves suggested using the "_probe" extension. This patch implements the change of calling the functions and variables "_hook" and replacing them with "_probe". Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 12 +++---- kernel/trace/ftrace.c | 78 +++++++++++++++++++++--------------------- kernel/trace/trace_functions.c | 26 +++++++------- 3 files changed, 58 insertions(+), 58 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 63281228ce3e..9d224c43e634 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -108,7 +108,7 @@ struct ftrace_func_command { struct seq_file; -struct ftrace_hook_ops { +struct ftrace_probe_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); @@ -116,19 +116,19 @@ struct ftrace_hook_ops { void (*free)(void **data); int (*print)(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, + struct ftrace_probe_ops *ops, void *data); }; extern int -register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); extern void -unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data); extern void -unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops); -extern void unregister_ftrace_function_hook_all(char *glob); +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); +extern void unregister_ftrace_function_probe_all(char *glob); enum { FTRACE_FL_FREE = (1 << 0), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index af9d95c0e4de..330a059f6ed7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -255,9 +255,9 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; -struct ftrace_func_hook { +struct ftrace_func_probe { struct hlist_node node; - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; unsigned long flags; unsigned long ip; void *data; @@ -830,11 +830,11 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) static int t_hash_show(struct seq_file *m, void *v) { - struct ftrace_func_hook *rec; + struct ftrace_func_probe *rec; struct hlist_node *hnd = v; char str[KSYM_SYMBOL_LEN]; - rec = hlist_entry(hnd, struct ftrace_func_hook, node); + rec = hlist_entry(hnd, struct ftrace_func_probe, node); if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); @@ -1351,9 +1351,9 @@ static int __init ftrace_mod_cmd_init(void) device_initcall(ftrace_mod_cmd_init); static void -function_trace_hook_call(unsigned long ip, unsigned long parent_ip) +function_trace_probe_call(unsigned long ip, unsigned long parent_ip) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct hlist_head *hhd; struct hlist_node *n; unsigned long key; @@ -1379,18 +1379,18 @@ function_trace_hook_call(unsigned long ip, unsigned long parent_ip) ftrace_preempt_enable(resched); } -static struct ftrace_ops trace_hook_ops __read_mostly = +static struct ftrace_ops trace_probe_ops __read_mostly = { - .func = function_trace_hook_call, + .func = function_trace_probe_call, }; -static int ftrace_hook_registered; +static int ftrace_probe_registered; -static void __enable_ftrace_function_hook(void) +static void __enable_ftrace_function_probe(void) { int i; - if (ftrace_hook_registered) + if (ftrace_probe_registered) return; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { @@ -1402,16 +1402,16 @@ static void __enable_ftrace_function_hook(void) if (i == FTRACE_FUNC_HASHSIZE) return; - __register_ftrace_function(&trace_hook_ops); + __register_ftrace_function(&trace_probe_ops); ftrace_startup(0); - ftrace_hook_registered = 1; + ftrace_probe_registered = 1; } -static void __disable_ftrace_function_hook(void) +static void __disable_ftrace_function_probe(void) { int i; - if (!ftrace_hook_registered) + if (!ftrace_probe_registered) return; for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { @@ -1421,16 +1421,16 @@ static void __disable_ftrace_function_hook(void) } /* no more funcs left */ - __unregister_ftrace_function(&trace_hook_ops); + __unregister_ftrace_function(&trace_probe_ops); ftrace_shutdown(0); - ftrace_hook_registered = 0; + ftrace_probe_registered = 0; } static void ftrace_free_entry_rcu(struct rcu_head *rhp) { - struct ftrace_func_hook *entry = - container_of(rhp, struct ftrace_func_hook, rcu); + struct ftrace_func_probe *entry = + container_of(rhp, struct ftrace_func_probe, rcu); if (entry->ops->free) entry->ops->free(&entry->data); @@ -1439,10 +1439,10 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp) int -register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct ftrace_page *pg; struct dyn_ftrace *rec; int type, len, not; @@ -1453,7 +1453,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); len = strlen(search); - /* we do not support '!' for function hooks */ + /* we do not support '!' for function probes */ if (WARN_ON(not)) return -EINVAL; @@ -1468,7 +1468,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { - /* If we did not hook to any, then return error */ + /* If we did not process any, then return error */ if (!count) count = -ENOMEM; goto out_unlock; @@ -1498,7 +1498,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); } while_for_each_ftrace_rec(); - __enable_ftrace_function_hook(); + __enable_ftrace_function_probe(); out_unlock: mutex_unlock(&ftrace_lock); @@ -1507,15 +1507,15 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, } enum { - HOOK_TEST_FUNC = 1, - HOOK_TEST_DATA = 2 + PROBE_TEST_FUNC = 1, + PROBE_TEST_DATA = 2 }; static void -__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { - struct ftrace_func_hook *entry; + struct ftrace_func_probe *entry; struct hlist_node *n, *tmp; char str[KSYM_SYMBOL_LEN]; int type = MATCH_FULL; @@ -1530,7 +1530,7 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); len = strlen(search); - /* we do not support '!' for function hooks */ + /* we do not support '!' for function probes */ if (WARN_ON(not)) return; } @@ -1542,10 +1542,10 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { /* break up if statements for readability */ - if ((flags & HOOK_TEST_FUNC) && entry->ops != ops) + if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) continue; - if ((flags & HOOK_TEST_DATA) && entry->data != data) + if ((flags & PROBE_TEST_DATA) && entry->data != data) continue; /* do this last, since it is the most expensive */ @@ -1560,27 +1560,27 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, call_rcu(&entry->rcu, ftrace_free_entry_rcu); } } - __disable_ftrace_function_hook(); + __disable_ftrace_function_probe(); mutex_unlock(&ftrace_lock); } void -unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops, +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { - __unregister_ftrace_function_hook(glob, ops, data, - HOOK_TEST_FUNC | HOOK_TEST_DATA); + __unregister_ftrace_function_probe(glob, ops, data, + PROBE_TEST_FUNC | PROBE_TEST_DATA); } void -unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops) +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) { - __unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC); + __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); } -void unregister_ftrace_function_hook_all(char *glob) +void unregister_ftrace_function_probe_all(char *glob) { - __unregister_ftrace_function_hook(glob, NULL, NULL, 0); + __unregister_ftrace_function_probe(glob, NULL, NULL, 0); } static LIST_HEAD(ftrace_commands); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 021a574c5988..6ea73ed03bfa 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -269,21 +269,21 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, void *data); + struct ftrace_probe_ops *ops, void *data); -static struct ftrace_hook_ops traceon_hook_ops = { +static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_trace_onoff_print, }; -static struct ftrace_hook_ops traceoff_hook_ops = { +static struct ftrace_probe_ops traceoff_probe_ops = { .func = ftrace_traceoff, .print = ftrace_trace_onoff_print, }; static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_hook_ops *ops, void *data) + struct ftrace_probe_ops *ops, void *data) { char str[KSYM_SYMBOL_LEN]; long count = (long)data; @@ -291,7 +291,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, kallsyms_lookup(ip, NULL, NULL, NULL, str); seq_printf(m, "%s:", str); - if (ops == &traceon_hook_ops) + if (ops == &traceon_probe_ops) seq_printf(m, "traceon"); else seq_printf(m, "traceoff"); @@ -306,15 +306,15 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, static int ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) { - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) - ops = &traceon_hook_ops; + ops = &traceon_probe_ops; else - ops = &traceoff_hook_ops; + ops = &traceoff_probe_ops; - unregister_ftrace_function_hook_func(glob, ops); + unregister_ftrace_function_probe_func(glob, ops); return 0; } @@ -322,7 +322,7 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) static int ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) { - struct ftrace_hook_ops *ops; + struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; @@ -336,9 +336,9 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) - ops = &traceon_hook_ops; + ops = &traceon_probe_ops; else - ops = &traceoff_hook_ops; + ops = &traceoff_probe_ops; if (!param) goto out_reg; @@ -357,7 +357,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) return ret; out_reg: - ret = register_ftrace_function_hook(glob, ops, count); + ret = register_ftrace_function_probe(glob, ops, count); return ret; } -- cgit v1.2.3-59-g8ed1b From af513098452b8887d7c0e15a39d7cb74479501bd Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 17 Feb 2009 01:07:28 -0500 Subject: tracing: use the more proper parameter Pass tsk to tracing_record_cmdline instead of current. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 95f99a7abf2f..dc61e82faad9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -336,7 +336,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) data->rt_priority = tsk->rt_priority; /* record this tasks comm */ - tracing_record_cmdline(current); + tracing_record_cmdline(tsk); } static void -- cgit v1.2.3-59-g8ed1b From d2ef7c2f0f9ab48c25eafc0ebad0df5f7930420b Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 17 Feb 2009 01:09:47 -0500 Subject: tracing: fix the return value of trace selftest This patch is to fix the return value of trace_selftest_startup_sysprof and trace_selftest_startup_branch on failure. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 0c9aa1457e51..c72e749bcbef 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -622,7 +622,7 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); - return 0; + return ret; } /* Sleep for a 1/10 of a second */ @@ -634,6 +634,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) trace->reset(tr); tracing_start(); + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + return ret; } #endif /* CONFIG_SYSPROF_TRACER */ @@ -661,6 +666,11 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) trace->reset(tr); tracing_start(); + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + return ret; } #endif /* CONFIG_BRANCH_TRACER */ -- cgit v1.2.3-59-g8ed1b From 73d8b8bc4f24a97a406d09c8268ac019f4ac661e Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Tue, 17 Feb 2009 01:10:02 -0500 Subject: tracing: fix typing mistake in hint message and comments Impact: cleanup Fix incorrect hint message in code and typos in comments. Signed-off-by: Wenji Huang Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 4 ++-- kernel/trace/trace_stat.c | 2 +- kernel/trace/trace_sysprof.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index c6b442d88de8..9e5ebd844158 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -1,5 +1,5 @@ /* - * trace irqs off criticall timings + * trace irqs off critical timings * * Copyright (C) 2007-2008 Steven Rostedt * Copyright (C) 2008 Ingo Molnar diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 30e14fe85896..82fbb5a2df89 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -93,7 +93,7 @@ static int tracing_sched_register(void) ret = register_trace_sched_switch(probe_sched_switch); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" - " probe to kernel_sched_schedule\n"); + " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 96d716485898..276c51aaf314 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -284,7 +284,7 @@ static void start_wakeup_tracer(struct trace_array *tr) ret = register_trace_sched_switch(probe_wakeup_sched_switch); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" - " probe to kernel_sched_schedule\n"); + " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c72e749bcbef..01415f4edaa5 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -107,9 +107,9 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, func(); /* - * Some archs *cough*PowerPC*cough* add charachters to the + * Some archs *cough*PowerPC*cough* add characters to the * start of the function names. We simply put a '*' to - * accomodate them. + * accommodate them. */ func_name = "*" STR(DYN_FTRACE_TEST_NAME); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index eae9cef39291..39310e3434ee 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -30,7 +30,7 @@ struct tracer_stat_session { struct dentry *file; }; -/* All of the sessions currently in use. Each stat file embeed one session */ +/* All of the sessions currently in use. Each stat file embed one session */ static LIST_HEAD(all_stat_sessions); static DEFINE_MUTEX(all_stat_sessions_mutex); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 7c9a2d82a7d8..c771af4e8f1a 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -327,5 +327,5 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer) d_tracer, NULL, &sysprof_sample_fops); if (entry) return; - pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n"); + pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n"); } -- cgit v1.2.3-59-g8ed1b From 35ebf1caa4854ad5ba25f3a72967acc064147994 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 13:12:12 -0500 Subject: ftrace: show unlimited when traceon or traceoff has no counter Impact: clean up The traceon and traceoff function probes are confusing to developers to what happens when a counter is not specified. This should help clear things up. # echo "*:traceoff" > set_ftrace_filter # cat /debug/tracing/set_ftrace_filter #### all functions enabled #### do_fork:traceoff:unlimited Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 6ea73ed03bfa..4c113a8c466f 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -296,7 +296,9 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, else seq_printf(m, "traceoff"); - if (count != -1) + if (count == -1) + seq_printf(m, ":unlimited\n"); + else seq_printf(m, ":count=%ld", count); seq_putc(m, '\n'); -- cgit v1.2.3-59-g8ed1b From 6eaaa5d57e76c454479833fc8594cd7c3b75c789 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Feb 2009 02:25:00 +0100 Subject: tracing/core: use appropriate waiting on trace_pipe Impact: api and pipe waiting change Currently, the waiting used in tracing_read_pipe() is done through a 100 msecs schedule_timeout() loop which periodically check if there are traces on the buffer. This can cause small latencies for programs which are reading the incoming events. This patch makes the reader waiting for the trace_wait waitqueue except for few tracers such as the sched and functions tracers which might be already hold the runqueue lock while waking up the reader. This is performed through a new callback wait_pipe() on struct tracer. If none is implemented on a specific tracer, the default waiting for trace_wait queue is attached. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 62 ++++++++++++++++++++++++------------ kernel/trace/trace.h | 25 +++++++++++++-- kernel/trace/trace_functions.c | 1 + kernel/trace/trace_functions_graph.c | 1 + kernel/trace/trace_sched_switch.c | 1 + kernel/trace/trace_sched_wakeup.c | 1 + 6 files changed, 67 insertions(+), 24 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc61e82faad9..881a94474d79 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -499,6 +499,9 @@ __acquires(kernel_lock) else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; + if (!type->wait_pipe) + type->wait_pipe = default_wait_pipe; + #ifdef CONFIG_FTRACE_STARTUP_TEST if (type->selftest && !tracing_selftest_disabled) { @@ -1064,7 +1067,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_prio = wakee->prio; entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - trace_buffer_unlock_commit(tr, event, flags, pc); + + ring_buffer_unlock_commit(tr->buffer, event); + ftrace_trace_stack(tr, flags, 6, pc); + ftrace_trace_userstack(tr, flags, pc); } void @@ -2392,6 +2398,38 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) } } + +void default_wait_pipe(struct trace_iterator *iter) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); + + if (trace_empty(iter)) + schedule(); + + finish_wait(&trace_wait, &wait); +} + +/* + * This is a make-shift waitqueue. + * A tracer might use this callback on some rare cases: + * + * 1) the current tracer might hold the runqueue lock when it wakes up + * a reader, hence a deadlock (sched, function, and function graph tracers) + * 2) the function tracers, trace all functions, we don't want + * the overhead of calling wake_up and friends + * (and tracing them too) + * + * Anyway, this is really very primitive wakeup. + */ +void poll_wait_pipe(struct trace_iterator *iter) +{ + set_current_state(TASK_INTERRUPTIBLE); + /* sleep for 100 msecs, and try again. */ + schedule_timeout(HZ / 10); +} + /* Must be called with trace_types_lock mutex held. */ static int tracing_wait_pipe(struct file *filp) { @@ -2403,30 +2441,14 @@ static int tracing_wait_pipe(struct file *filp) return -EAGAIN; } - /* - * This is a make-shift waitqueue. The reason we don't use - * an actual wait queue is because: - * 1) we only ever have one waiter - * 2) the tracing, traces all functions, we don't want - * the overhead of calling wake_up and friends - * (and tracing them too) - * Anyway, this is really very primitive wakeup. - */ - set_current_state(TASK_INTERRUPTIBLE); - iter->tr->waiter = current; - mutex_unlock(&trace_types_lock); - /* sleep for 100 msecs, and try again. */ - schedule_timeout(HZ/10); + iter->trace->wait_pipe(iter); mutex_lock(&trace_types_lock); - iter->tr->waiter = NULL; - - if (signal_pending(current)) { + if (signal_pending(current)) return -EINTR; - } if (iter->trace != current_trace) return 0; @@ -2442,8 +2464,6 @@ static int tracing_wait_pipe(struct file *filp) */ if (!tracer_enabled && iter->pos) break; - - continue; } return 1; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dbff0207b213..eed732c151fc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -337,18 +337,34 @@ struct tracer_flags { #define TRACER_OPT(s, b) .name = #s, .bit = b -/* - * A specific tracer, represented by methods that operate on a trace array: +/** + * struct tracer - a specific tracer and its callbacks to interact with debugfs + * @name: the name chosen to select it on the available_tracers file + * @init: called when one switches to this tracer (echo name > current_tracer) + * @reset: called when one switches to another tracer + * @start: called when tracing is unpaused (echo 1 > tracing_enabled) + * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @open: called when the trace file is opened + * @pipe_open: called when the trace_pipe file is opened + * @wait_pipe: override how the user waits for traces on trace_pipe + * @close: called when the trace file is released + * @read: override the default read callback on trace_pipe + * @splice_read: override the default splice_read callback on trace_pipe + * @selftest: selftest to run on boot (see trace_selftest.c) + * @print_headers: override the first lines that describe your columns + * @print_line: callback that prints a trace + * @set_flag: signals one of your private flags changed (trace_options file) + * @flags: your private flags */ struct tracer { const char *name; - /* Your tracer should raise a warning if init fails */ int (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); void (*start)(struct trace_array *tr); void (*stop)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); + void (*wait_pipe)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, @@ -432,6 +448,9 @@ void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc); +void default_wait_pipe(struct trace_iterator *iter); +void poll_wait_pipe(struct trace_iterator *iter); + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 4c113a8c466f..c9a0b7df44ff 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -225,6 +225,7 @@ static struct tracer function_trace __read_mostly = .init = function_trace_init, .reset = function_trace_reset, .start = function_trace_start, + .wait_pipe = poll_wait_pipe, .flags = &func_flags, .set_flag = func_set_flag, #ifdef CONFIG_FTRACE_SELFTEST diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 519a0cab1530..0ff5cb661900 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -757,6 +757,7 @@ static struct tracer graph_trace __read_mostly = { .name = "function_graph", .open = graph_trace_open, .close = graph_trace_close, + .wait_pipe = poll_wait_pipe, .init = graph_trace_init, .reset = graph_trace_reset, .print_line = print_graph_function, diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 82fbb5a2df89..77132c2cf3d9 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -221,6 +221,7 @@ static struct tracer sched_switch_trace __read_mostly = .reset = sched_switch_trace_reset, .start = sched_switch_trace_start, .stop = sched_switch_trace_stop, + .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_sched_switch, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 276c51aaf314..db55f7aaa640 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -380,6 +380,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .reset = wakeup_tracer_reset, .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, + .wait_pipe = poll_wait_pipe, .print_max = 1, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, -- cgit v1.2.3-59-g8ed1b From fa7c7f6e11f70d62505074a8b30a776236850dec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Feb 2009 02:51:30 +0100 Subject: tracing/core: remove unused parameter in tracing_fill_pipe_page() Impact: cleanup The struct page *pages parameter is unused. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 881a94474d79..e1f3b99a2e52 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2571,8 +2571,7 @@ static struct pipe_buf_operations tracing_pipe_buf_ops = { }; static size_t -tracing_fill_pipe_page(struct page *pages, size_t rem, - struct trace_iterator *iter) +tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; int ret; @@ -2649,7 +2648,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (!pages[i]) break; - rem = tracing_fill_pipe_page(pages[i], rem, iter); + rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, -- cgit v1.2.3-59-g8ed1b From cf7dab801796b9ee52a6dc99888a66bf476538ec Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Wed, 18 Feb 2009 15:41:28 -0600 Subject: Revert "[XFS] use scalable vmap API" This reverts commit 95f8e302c04c0b0c6de35ab399a5551605eeb006. This commit caused regression. We'll try to fix use of new vmap API for next release. Signed-off-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index d71dc44e21ed..0b2177a9fbdc 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -264,7 +264,7 @@ xfs_buf_free( uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count); + vunmap(bp->b_addr - bp->b_offset); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -386,8 +386,8 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + bp->b_addr = vmap(bp->b_pages, bp->b_page_count, + VM_MAP, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) return -ENOMEM; bp->b_addr += bp->b_offset; -- cgit v1.2.3-59-g8ed1b From 3a011a171906a3a51a43bb860fb7c66a64cab140 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Wed, 18 Feb 2009 15:56:51 -0600 Subject: Revert "[XFS] remove old vmap cache" This reverts commit d2859751cd0bf586941ffa7308635a293f943c17. This commit caused regression. We'll try to fix use of new vmap API for next release. Signed-off-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 75 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 0b2177a9fbdc..cb329edc925b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -165,6 +165,75 @@ test_page_region( return (mask && (page_private(page) & mask) == mask); } +/* + * Mapping of multi-page buffers into contiguous virtual space + */ + +typedef struct a_list { + void *vm_addr; + struct a_list *next; +} a_list_t; + +static a_list_t *as_free_head; +static int as_list_len; +static DEFINE_SPINLOCK(as_lock); + +/* + * Try to batch vunmaps because they are costly. + */ +STATIC void +free_address( + void *addr) +{ + a_list_t *aentry; + +#ifdef CONFIG_XEN + /* + * Xen needs to be able to make sure it can get an exclusive + * RO mapping of pages it wants to turn into a pagetable. If + * a newly allocated page is also still being vmap()ed by xfs, + * it will cause pagetable construction to fail. This is a + * quick workaround to always eagerly unmap pages so that Xen + * is happy. + */ + vunmap(addr); + return; +#endif + + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); + if (likely(aentry)) { + spin_lock(&as_lock); + aentry->next = as_free_head; + aentry->vm_addr = addr; + as_free_head = aentry; + as_list_len++; + spin_unlock(&as_lock); + } else { + vunmap(addr); + } +} + +STATIC void +purge_addresses(void) +{ + a_list_t *aentry, *old; + + if (as_free_head == NULL) + return; + + spin_lock(&as_lock); + aentry = as_free_head; + as_free_head = NULL; + as_list_len = 0; + spin_unlock(&as_lock); + + while ((old = aentry) != NULL) { + vunmap(aentry->vm_addr); + aentry = aentry->next; + kfree(old); + } +} + /* * Internal xfs_buf_t object manipulation */ @@ -264,7 +333,7 @@ xfs_buf_free( uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - vunmap(bp->b_addr - bp->b_offset); + free_address(bp->b_addr - bp->b_offset); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -386,6 +455,8 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { + if (as_list_len > 64) + purge_addresses(); bp->b_addr = vmap(bp->b_pages, bp->b_page_count, VM_MAP, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) @@ -1672,6 +1743,8 @@ xfsbufd( count++; } + if (as_list_len > 0) + purge_addresses(); if (count) blk_run_address_space(target->bt_mapping); -- cgit v1.2.3-59-g8ed1b From d1f9cbd78841f1a797c77e9117e4882f932c2ef6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 18 Feb 2009 04:25:25 +0100 Subject: tracing/function-graph-tracer: fix traces weirdness while absolute time printing Impact: trace output cleanup/reordering When an interrupt occurs and and the abstime option is selected: echo funcgraph-abstime > /debug/tracing/trace_options then we observe broken traces: 30581.025422 | 0) Xorg-4291 | 0.503 us | idle_cpu(); 30581.025424 | 0) Xorg-4291 | 2.576 us | } 30581.025424 | 0) Xorg-4291 | + 75.771 us | } 0) Xorg-4291 | <========== | 30581.025425 | 0) Xorg-4291 | | schedule() { 30581.025426 | 0) Xorg-4291 | | __schedule() { 30581.025426 | 0) Xorg-4291 | 0.705 us | _spin_lock_irq(); With this patch, the interrupts output better adapts to absolute time printing: 414.856543 | 1) Xorg-4279 | 8.816 us | } 414.856544 | 1) Xorg-4279 | 0.525 us | rcu_irq_exit(); 414.856545 | 1) Xorg-4279 | 0.526 us | idle_cpu(); 414.856546 | 1) Xorg-4279 | + 12.157 us | } 414.856549 | 1) Xorg-4279 | ! 104.114 us | } 414.856549 | 1) Xorg-4279 | <========== | 414.856549 | 1) Xorg-4279 | ! 107.944 us | } 414.856550 | 1) Xorg-4279 | ! 137.010 us | } 414.856551 | 1) Xorg-4279 | 0.624 us | _read_unlock(); 414.856552 | 1) Xorg-4279 | ! 140.930 us | } 414.856552 | 1) Xorg-4279 | ! 166.159 us | } Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 50 +++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6c7738e4f98b..8f4004a00b4e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -351,16 +351,35 @@ print_graph_overhead(unsigned long long duration, struct trace_seq *s) return trace_seq_printf(s, " "); } +static int print_graph_abs_time(u64 t, struct trace_seq *s) +{ + unsigned long usecs_rem; + + usecs_rem = do_div(t, NSEC_PER_SEC); + usecs_rem /= 1000; + + return trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); +} + static enum print_line_t -print_graph_irq(struct trace_seq *s, unsigned long addr, +print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid) { int ret; + struct trace_seq *s = &iter->seq; if (addr < (unsigned long)__irqentry_text_start || addr >= (unsigned long)__irqentry_text_end) return TRACE_TYPE_UNHANDLED; + /* Absolute time */ + if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); @@ -446,17 +465,6 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) } -static int print_graph_abs_time(u64 t, struct trace_seq *s) -{ - unsigned long usecs_rem; - - usecs_rem = do_div(t, 1000000000); - usecs_rem /= 1000; - - return trace_seq_printf(s, "%5lu.%06lu | ", - (unsigned long)t, usecs_rem); -} - /* Case of a leaf function on its call entry */ static enum print_line_t print_graph_entry_leaf(struct trace_iterator *iter, @@ -561,7 +569,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; /* Interrupt */ - ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); + ret = print_graph_irq(iter, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -581,7 +589,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, /* Proc */ if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); + ret = print_graph_proc(s, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -605,11 +613,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int i; int ret; int cpu = iter->cpu; - pid_t *last_pid = iter->private; + pid_t *last_pid = iter->private, pid = ent->pid; unsigned long long duration = trace->rettime - trace->calltime; /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; /* Absolute time */ @@ -668,7 +676,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid); + ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; @@ -684,6 +692,10 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, int cpu = iter->cpu; pid_t *last_pid = iter->private; + /* Pid */ + if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + /* Absolute time */ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { ret = print_graph_abs_time(iter->ts, s); @@ -691,10 +703,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - /* Cpu */ if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { ret = print_graph_cpu(s, cpu); -- cgit v1.2.3-59-g8ed1b From 985ec20ad531f2641ab9d5193e37891fe959fc7d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 18 Feb 2009 06:35:34 +0100 Subject: tracing/function-graph-tracer: provide documentation for the function graph tracer Update documentation for the function graph tracer. Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 226 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 758fb42a1b68..055bcd2992da 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -129,6 +129,10 @@ of ftrace. Here is a list of some of the key files: set_ftrace_pid: Have the function tracer only trace a single thread. + set_graph_function: Select the function where the trace have to start + with the function graph tracer (See the section + "dynamic ftrace" for more details). + available_filter_functions: This lists the functions that ftrace has processed and can trace. These are the function names that you can pass to "set_ftrace_filter" or @@ -143,6 +147,12 @@ Here is the list of current tracers that may be configured. function - function tracer that uses mcount to trace all functions. + function_graph_tracer - similar to the function tracer except that the + function tracer probes the functions on their entry whereas the + function graph tracer traces on both entry and exit of the + functions. It then provides the ability to draw a graph of + function calls like a primitive C code source. + sched_switch - traces the context switches between tasks. irqsoff - traces the areas that disable interrupts and saves @@ -1226,6 +1236,163 @@ kernel module: [...] +function graph tracer +--------------------------- + +This tracer is similar to the function tracer except that it probes +a function on its entry and its exit. +This is done by setting a dynamically allocated stack of return addresses on each +task_struct. Then the tracer overwrites the return address of each function traced +to set a custom probe. Thus the original return address is stored on the stack of return +address in the task_struct. + +Probing on both extremities of a function leads to special features such as + +_ measure of function's time execution +_ having a reliable call stack to draw function calls graph + +This tracer is useful in several situations: + +_ you want to find the reason of a strange kernel behavior and need to see + what happens in detail on any areas (or specific ones). +_ you are experiencing weird latencies but it's difficult to find its origin. +_ you want to find quickly which path is taken by a specific function +_ you just want to see what happens inside your kernel + +# tracer: function_graph +# +# CPU DURATION FUNCTION CALLS +# | | | | | | | + + 0) | sys_open() { + 0) | do_sys_open() { + 0) | getname() { + 0) | kmem_cache_alloc() { + 0) 1.382 us | __might_sleep(); + 0) 2.478 us | } + 0) | strncpy_from_user() { + 0) | might_fault() { + 0) 1.389 us | __might_sleep(); + 0) 2.553 us | } + 0) 3.807 us | } + 0) 7.876 us | } + 0) | alloc_fd() { + 0) 0.668 us | _spin_lock(); + 0) 0.570 us | expand_files(); + 0) 0.586 us | _spin_unlock(); + + +There are several columns that can be dynamically enabled/disabled. +You can use every combination of options you want, depending on your needs. + +_ The cpu number on which the function executed is default enabled. + It is sometimes better to only trace one cpu (see tracing_cpu_mask file) + or you might sometimes see unordered function calls while cpu tracing switch. + + hide: echo nofuncgraph-cpu > /debug/tracing/trace_options + show: echo funcgraph-cpu > /debug/tracing/trace_options + +_ The duration (function's time of execution) is displayed on the closing bracket + line of a function or on the same line than the current function in case of a leaf + one. It is default enabled. + + hide: echo nofuncgraph-duration > /debug/tracing/trace_options + show: echo funcgraph-duration > /debug/tracing/trace_options + +_ The overhead field precedes the duration one in case of reached duration thresholds. + + hide: echo nofuncgraph-overhead > /debug/tracing/trace_options + show: echo funcgraph-overhead > /debug/tracing/trace_options + depends on: funcgraph-duration + + ie: + + 0) | up_write() { + 0) 0.646 us | _spin_lock_irqsave(); + 0) 0.684 us | _spin_unlock_irqrestore(); + 0) 3.123 us | } + 0) 0.548 us | fput(); + 0) + 58.628 us | } + + [...] + + 0) | putname() { + 0) | kmem_cache_free() { + 0) 0.518 us | __phys_addr(); + 0) 1.757 us | } + 0) 2.861 us | } + 0) ! 115.305 us | } + 0) ! 116.402 us | } + + + means that the function exceeded 10 usecs. + ! means that the function exceeded 100 usecs. + + +_ The task/pid field displays the thread cmdline and pid which executed the function. + It is default disabled. + + hide: echo nofuncgraph-proc > /debug/tracing/trace_options + show: echo funcgraph-proc > /debug/tracing/trace_options + + ie: + + # tracer: function_graph + # + # CPU TASK/PID DURATION FUNCTION CALLS + # | | | | | | | | | + 0) sh-4802 | | d_free() { + 0) sh-4802 | | call_rcu() { + 0) sh-4802 | | __call_rcu() { + 0) sh-4802 | 0.616 us | rcu_process_gp_end(); + 0) sh-4802 | 0.586 us | check_for_new_grace_period(); + 0) sh-4802 | 2.899 us | } + 0) sh-4802 | 4.040 us | } + 0) sh-4802 | 5.151 us | } + 0) sh-4802 | + 49.370 us | } + + +_ The absolute time field is an absolute timestamp given by the clock since + it started. A snapshot of this time is given on each entry/exit of functions + + hide: echo nofuncgraph-abstime > /debug/tracing/trace_options + show: echo funcgraph-abstime > /debug/tracing/trace_options + + ie: + + # + # TIME CPU DURATION FUNCTION CALLS + # | | | | | | | | + 360.774522 | 1) 0.541 us | } + 360.774522 | 1) 4.663 us | } + 360.774523 | 1) 0.541 us | __wake_up_bit(); + 360.774524 | 1) 6.796 us | } + 360.774524 | 1) 7.952 us | } + 360.774525 | 1) 9.063 us | } + 360.774525 | 1) 0.615 us | journal_mark_dirty(); + 360.774527 | 1) 0.578 us | __brelse(); + 360.774528 | 1) | reiserfs_prepare_for_journal() { + 360.774528 | 1) | unlock_buffer() { + 360.774529 | 1) | wake_up_bit() { + 360.774529 | 1) | bit_waitqueue() { + 360.774530 | 1) 0.594 us | __phys_addr(); + + +You can put some comments on specific functions by using ftrace_printk() +For example, if you want to put a comment inside the __might_sleep() function, +you just have to include and call ftrace_printk() inside __might_sleep() + +ftrace_printk("I'm a comment!\n") + +will produce: + + 1) | __might_sleep() { + 1) | /* I'm a comment! */ + 1) 1.449 us | } + + +You might find other useful features for this tracer on the "dynamic ftrace" +section such as tracing only specific functions or tasks. + dynamic ftrace -------------- @@ -1427,6 +1594,65 @@ Produces: We can see that there's no more lock or preempt tracing. + +* Dynamic ftrace with the function graph tracer * + + +Although what has been explained above concerns both the function tracer and +the function_graph_tracer, the following concerns only the latter. + +If you want to trace only one function and all of its childs, you just have +to echo its name on set_graph_function: + +echo __do_fault > set_graph_function + +will produce the following: + + 0) | __do_fault() { + 0) | filemap_fault() { + 0) | find_lock_page() { + 0) 0.804 us | find_get_page(); + 0) | __might_sleep() { + 0) 1.329 us | } + 0) 3.904 us | } + 0) 4.979 us | } + 0) 0.653 us | _spin_lock(); + 0) 0.578 us | page_add_file_rmap(); + 0) 0.525 us | native_set_pte_at(); + 0) 0.585 us | _spin_unlock(); + 0) | unlock_page() { + 0) 0.541 us | page_waitqueue(); + 0) 0.639 us | __wake_up_bit(); + 0) 2.786 us | } + 0) + 14.237 us | } + 0) | __do_fault() { + 0) | filemap_fault() { + 0) | find_lock_page() { + 0) 0.698 us | find_get_page(); + 0) | __might_sleep() { + 0) 1.412 us | } + 0) 3.950 us | } + 0) 5.098 us | } + 0) 0.631 us | _spin_lock(); + 0) 0.571 us | page_add_file_rmap(); + 0) 0.526 us | native_set_pte_at(); + 0) 0.586 us | _spin_unlock(); + 0) | unlock_page() { + 0) 0.533 us | page_waitqueue(); + 0) 0.638 us | __wake_up_bit(); + 0) 2.793 us | } + 0) + 14.012 us | } + +You can also select several functions: + +echo sys_open > set_graph_function +echo sys_close >> set_graph_function + +Now if you want to go back to trace all functions + +echo > set_graph_function + + trace_pipe ---------- -- cgit v1.2.3-59-g8ed1b From 5752674e140db5bce08c6bc60021a9bc3b960800 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Feb 2009 12:54:10 +0100 Subject: Documentation/ftrace.txt: update - fix typos/grammos and clarify the text - prettify the document some more Cc: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 956 ++++++++++++++++++++++++++--------------------- 1 file changed, 530 insertions(+), 426 deletions(-) diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 055bcd2992da..2041ee951c1a 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -15,31 +15,31 @@ Introduction Ftrace is an internal tracer designed to help out developers and designers of systems to find what is going on inside the kernel. -It can be used for debugging or analyzing latencies and performance -issues that take place outside of user-space. +It can be used for debugging or analyzing latencies and +performance issues that take place outside of user-space. Although ftrace is the function tracer, it also includes an -infrastructure that allows for other types of tracing. Some of the -tracers that are currently in ftrace include a tracer to trace -context switches, the time it takes for a high priority task to -run after it was woken up, the time interrupts are disabled, and -more (ftrace allows for tracer plugins, which means that the list of -tracers can always grow). +infrastructure that allows for other types of tracing. Some of +the tracers that are currently in ftrace include a tracer to +trace context switches, the time it takes for a high priority +task to run after it was woken up, the time interrupts are +disabled, and more (ftrace allows for tracer plugins, which +means that the list of tracers can always grow). The File System --------------- -Ftrace uses the debugfs file system to hold the control files as well -as the files to display output. +Ftrace uses the debugfs file system to hold the control files as +well as the files to display output. To mount the debugfs system: # mkdir /debug # mount -t debugfs nodev /debug -(Note: it is more common to mount at /sys/kernel/debug, but for simplicity - this document will use /debug) +( Note: it is more common to mount at /sys/kernel/debug, but for + simplicity this document will use /debug) That's it! (assuming that you have ftrace configured into your kernel) @@ -50,94 +50,124 @@ of ftrace. Here is a list of some of the key files: Note: all time values are in microseconds. - current_tracer: This is used to set or display the current tracer - that is configured. - - available_tracers: This holds the different types of tracers that - have been compiled into the kernel. The tracers - listed here can be configured by echoing their name - into current_tracer. - - tracing_enabled: This sets or displays whether the current_tracer - is activated and tracing or not. Echo 0 into this - file to disable the tracer or 1 to enable it. - - trace: This file holds the output of the trace in a human readable - format (described below). - - latency_trace: This file shows the same trace but the information - is organized more to display possible latencies - in the system (described below). - - trace_pipe: The output is the same as the "trace" file but this - file is meant to be streamed with live tracing. - Reads from this file will block until new data - is retrieved. Unlike the "trace" and "latency_trace" - files, this file is a consumer. This means reading - from this file causes sequential reads to display - more current data. Once data is read from this - file, it is consumed, and will not be read - again with a sequential read. The "trace" and - "latency_trace" files are static, and if the - tracer is not adding more data, they will display - the same information every time they are read. - - trace_options: This file lets the user control the amount of data - that is displayed in one of the above output - files. - - trace_max_latency: Some of the tracers record the max latency. - For example, the time interrupts are disabled. - This time is saved in this file. The max trace - will also be stored, and displayed by either - "trace" or "latency_trace". A new max trace will - only be recorded if the latency is greater than - the value in this file. (in microseconds) - - buffer_size_kb: This sets or displays the number of kilobytes each CPU - buffer can hold. The tracer buffers are the same size - for each CPU. The displayed number is the size of the - CPU buffer and not total size of all buffers. The - trace buffers are allocated in pages (blocks of memory - that the kernel uses for allocation, usually 4 KB in size). - If the last page allocated has room for more bytes - than requested, the rest of the page will be used, - making the actual allocation bigger than requested. - (Note, the size may not be a multiple of the page size due - to buffer managment overhead.) - - This can only be updated when the current_tracer - is set to "nop". - - tracing_cpumask: This is a mask that lets the user only trace - on specified CPUS. The format is a hex string - representing the CPUS. - - set_ftrace_filter: When dynamic ftrace is configured in (see the - section below "dynamic ftrace"), the code is dynamically - modified (code text rewrite) to disable calling of the - function profiler (mcount). This lets tracing be configured - in with practically no overhead in performance. This also - has a side effect of enabling or disabling specific functions - to be traced. Echoing names of functions into this file - will limit the trace to only those functions. - - set_ftrace_notrace: This has an effect opposite to that of - set_ftrace_filter. Any function that is added here will not - be traced. If a function exists in both set_ftrace_filter - and set_ftrace_notrace, the function will _not_ be traced. - - set_ftrace_pid: Have the function tracer only trace a single thread. - - set_graph_function: Select the function where the trace have to start - with the function graph tracer (See the section - "dynamic ftrace" for more details). - - available_filter_functions: This lists the functions that ftrace - has processed and can trace. These are the function - names that you can pass to "set_ftrace_filter" or - "set_ftrace_notrace". (See the section "dynamic ftrace" - below for more details.) + current_tracer: + + This is used to set or display the current tracer + that is configured. + + available_tracers: + + This holds the different types of tracers that + have been compiled into the kernel. The + tracers listed here can be configured by + echoing their name into current_tracer. + + tracing_enabled: + + This sets or displays whether the current_tracer + is activated and tracing or not. Echo 0 into this + file to disable the tracer or 1 to enable it. + + trace: + + This file holds the output of the trace in a human + readable format (described below). + + latency_trace: + + This file shows the same trace but the information + is organized more to display possible latencies + in the system (described below). + + trace_pipe: + + The output is the same as the "trace" file but this + file is meant to be streamed with live tracing. + Reads from this file will block until new data + is retrieved. Unlike the "trace" and "latency_trace" + files, this file is a consumer. This means reading + from this file causes sequential reads to display + more current data. Once data is read from this + file, it is consumed, and will not be read + again with a sequential read. The "trace" and + "latency_trace" files are static, and if the + tracer is not adding more data, they will display + the same information every time they are read. + + trace_options: + + This file lets the user control the amount of data + that is displayed in one of the above output + files. + + trace_max_latency: + + Some of the tracers record the max latency. + For example, the time interrupts are disabled. + This time is saved in this file. The max trace + will also be stored, and displayed by either + "trace" or "latency_trace". A new max trace will + only be recorded if the latency is greater than + the value in this file. (in microseconds) + + buffer_size_kb: + + This sets or displays the number of kilobytes each CPU + buffer can hold. The tracer buffers are the same size + for each CPU. The displayed number is the size of the + CPU buffer and not total size of all buffers. The + trace buffers are allocated in pages (blocks of memory + that the kernel uses for allocation, usually 4 KB in size). + If the last page allocated has room for more bytes + than requested, the rest of the page will be used, + making the actual allocation bigger than requested. + ( Note, the size may not be a multiple of the page size + due to buffer managment overhead. ) + + This can only be updated when the current_tracer + is set to "nop". + + tracing_cpumask: + + This is a mask that lets the user only trace + on specified CPUS. The format is a hex string + representing the CPUS. + + set_ftrace_filter: + + When dynamic ftrace is configured in (see the + section below "dynamic ftrace"), the code is dynamically + modified (code text rewrite) to disable calling of the + function profiler (mcount). This lets tracing be configured + in with practically no overhead in performance. This also + has a side effect of enabling or disabling specific functions + to be traced. Echoing names of functions into this file + will limit the trace to only those functions. + + set_ftrace_notrace: + + This has an effect opposite to that of + set_ftrace_filter. Any function that is added here will not + be traced. If a function exists in both set_ftrace_filter + and set_ftrace_notrace, the function will _not_ be traced. + + set_ftrace_pid: + + Have the function tracer only trace a single thread. + + set_graph_function: + + Set a "trigger" function where tracing should start + with the function graph tracer (See the section + "dynamic ftrace" for more details). + + available_filter_functions: + + This lists the functions that ftrace + has processed and can trace. These are the function + names that you can pass to "set_ftrace_filter" or + "set_ftrace_notrace". (See the section "dynamic ftrace" + below for more details.) The Tracers @@ -145,44 +175,66 @@ The Tracers Here is the list of current tracers that may be configured. - function - function tracer that uses mcount to trace all functions. + "function" + + Function call tracer to trace all kernel functions. + + "function_graph_tracer" + + Similar to the function tracer except that the + function tracer probes the functions on their entry + whereas the function graph tracer traces on both entry + and exit of the functions. It then provides the ability + to draw a graph of function calls similar to C code + source. + + "sched_switch" + + Traces the context switches and wakeups between tasks. + + "irqsoff" + + Traces the areas that disable interrupts and saves + the trace with the longest max latency. + See tracing_max_latency. When a new max is recorded, + it replaces the old trace. It is best to view this + trace via the latency_trace file. - function_graph_tracer - similar to the function tracer except that the - function tracer probes the functions on their entry whereas the - function graph tracer traces on both entry and exit of the - functions. It then provides the ability to draw a graph of - function calls like a primitive C code source. + "preemptoff" - sched_switch - traces the context switches between tasks. + Similar to irqsoff but traces and records the amount of + time for which preemption is disabled. - irqsoff - traces the areas that disable interrupts and saves - the trace with the longest max latency. - See tracing_max_latency. When a new max is recorded, - it replaces the old trace. It is best to view this - trace via the latency_trace file. + "preemptirqsoff" - preemptoff - Similar to irqsoff but traces and records the amount of - time for which preemption is disabled. + Similar to irqsoff and preemptoff, but traces and + records the largest time for which irqs and/or preemption + is disabled. - preemptirqsoff - Similar to irqsoff and preemptoff, but traces and - records the largest time for which irqs and/or preemption - is disabled. + "wakeup" - wakeup - Traces and records the max latency that it takes for - the highest priority task to get scheduled after - it has been woken up. + Traces and records the max latency that it takes for + the highest priority task to get scheduled after + it has been woken up. - nop - This is not a tracer. To remove all tracers from tracing - simply echo "nop" into current_tracer. + "hw-branch-tracer" - hw-branch-tracer - traces branches on all cpu's in a circular buffer. + Uses the BTS CPU feature on x86 CPUs to traces all + branches executed. + + "nop" + + This is the "trace nothing" tracer. To remove all + tracers from tracing simply echo "nop" into + current_tracer. Examples of using the tracer ---------------------------- -Here are typical examples of using the tracers when controlling them only -with the debugfs interface (without using any user-land utilities). +Here are typical examples of using the tracers when controlling +them only with the debugfs interface (without using any +user-land utilities). Output format: -------------- @@ -199,16 +251,16 @@ Here is an example of the output format of the file "trace" bash-4251 [01] 10152.583855: _atomic_dec_and_lock <-dput -------- -A header is printed with the tracer name that is represented by the trace. -In this case the tracer is "function". Then a header showing the format. Task -name "bash", the task PID "4251", the CPU that it was running on -"01", the timestamp in . format, the function name that was -traced "path_put" and the parent function that called this function -"path_walk". The timestamp is the time at which the function was -entered. +A header is printed with the tracer name that is represented by +the trace. In this case the tracer is "function". Then a header +showing the format. Task name "bash", the task PID "4251", the +CPU that it was running on "01", the timestamp in . +format, the function name that was traced "path_put" and the +parent function that called this function "path_walk". The +timestamp is the time at which the function was entered. -The sched_switch tracer also includes tracing of task wakeups and -context switches. +The sched_switch tracer also includes tracing of task wakeups +and context switches. ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S @@ -217,8 +269,8 @@ context switches. kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R -Wake ups are represented by a "+" and the context switches are shown as -"==>". The format is: +Wake ups are represented by a "+" and the context switches are +shown as "==>". The format is: Context switches: @@ -232,19 +284,20 @@ Wake ups are represented by a "+" and the context switches are shown as :: + :: -The prio is the internal kernel priority, which is the inverse of the -priority that is usually displayed by user-space tools. Zero represents -the highest priority (99). Prio 100 starts the "nice" priorities with -100 being equal to nice -20 and 139 being nice 19. The prio "140" is -reserved for the idle task which is the lowest priority thread (pid 0). +The prio is the internal kernel priority, which is the inverse +of the priority that is usually displayed by user-space tools. +Zero represents the highest priority (99). Prio 100 starts the +"nice" priorities with 100 being equal to nice -20 and 139 being +nice 19. The prio "140" is reserved for the idle task which is +the lowest priority thread (pid 0). Latency trace format -------------------- -For traces that display latency times, the latency_trace file gives -somewhat more information to see why a latency happened. Here is a typical -trace. +For traces that display latency times, the latency_trace file +gives somewhat more information to see why a latency happened. +Here is a typical trace. # tracer: irqsoff # @@ -271,20 +324,20 @@ irqsoff latency trace v1.1.5 on 2.6.26-rc8 -0 0d.s1 98us : trace_hardirqs_on (do_softirq) +This shows that the current tracer is "irqsoff" tracing the time +for which interrupts were disabled. It gives the trace version +and the version of the kernel upon which this was executed on +(2.6.26-rc8). Then it displays the max latency in microsecs (97 +us). The number of trace entries displayed and the total number +recorded (both are three: #3/3). The type of preemption that was +used (PREEMPT). VP, KP, SP, and HP are always zero and are +reserved for later use. #P is the number of online CPUS (#P:2). -This shows that the current tracer is "irqsoff" tracing the time for which -interrupts were disabled. It gives the trace version and the version -of the kernel upon which this was executed on (2.6.26-rc8). Then it displays -the max latency in microsecs (97 us). The number of trace entries displayed -and the total number recorded (both are three: #3/3). The type of -preemption that was used (PREEMPT). VP, KP, SP, and HP are always zero -and are reserved for later use. #P is the number of online CPUS (#P:2). - -The task is the process that was running when the latency occurred. -(swapper pid: 0). +The task is the process that was running when the latency +occurred. (swapper pid: 0). -The start and stop (the functions in which the interrupts were disabled and -enabled respectively) that caused the latencies: +The start and stop (the functions in which the interrupts were +disabled and enabled respectively) that caused the latencies: apic_timer_interrupt is where the interrupts were disabled. do_softirq is where they were enabled again. @@ -320,12 +373,12 @@ The above is mostly meaningful for kernel developers. latency_trace file is relative to the start of the trace. delay: This is just to help catch your eye a bit better. And - needs to be fixed to be only relative to the same CPU. - The marks are determined by the difference between this - current trace and the next trace. - '!' - greater than preempt_mark_thresh (default 100) - '+' - greater than 1 microsecond - ' ' - less than or equal to 1 microsecond. + needs to be fixed to be only relative to the same CPU. + The marks are determined by the difference between this + current trace and the next trace. + '!' - greater than preempt_mark_thresh (default 100) + '+' - greater than 1 microsecond + ' ' - less than or equal to 1 microsecond. The rest is the same as the 'trace' file. @@ -333,14 +386,15 @@ The above is mostly meaningful for kernel developers. trace_options ------------- -The trace_options file is used to control what gets printed in the trace -output. To see what is available, simply cat the file: +The trace_options file is used to control what gets printed in +the trace output. To see what is available, simply cat the file: cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ - noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj + noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj -To disable one of the options, echo in the option prepended with "no". +To disable one of the options, echo in the option prepended with +"no". echo noprint-parent > /debug/tracing/trace_options @@ -350,8 +404,8 @@ To enable an option, leave off the "no". Here are the available options: - print-parent - On function traces, display the calling function - as well as the function being traced. + print-parent - On function traces, display the calling (parent) + function as well as the function being traced. print-parent: bash-4000 [01] 1477.606694: simple_strtoul <-strict_strtoul @@ -360,15 +414,16 @@ Here are the available options: bash-4000 [01] 1477.606694: simple_strtoul - sym-offset - Display not only the function name, but also the offset - in the function. For example, instead of seeing just - "ktime_get", you will see "ktime_get+0xb/0x20". + sym-offset - Display not only the function name, but also the + offset in the function. For example, instead of + seeing just "ktime_get", you will see + "ktime_get+0xb/0x20". sym-offset: bash-4000 [01] 1477.606694: simple_strtoul+0x6/0xa0 - sym-addr - this will also display the function address as well as - the function name. + sym-addr - this will also display the function address as well + as the function name. sym-addr: bash-4000 [01] 1477.606694: simple_strtoul @@ -378,35 +433,41 @@ Here are the available options: bash 4000 1 0 00000000 00010a95 [58127d26] 1720.415ms \ (+0.000ms): simple_strtoul (strict_strtoul) - raw - This will display raw numbers. This option is best for use with - user applications that can translate the raw numbers better than - having it done in the kernel. + raw - This will display raw numbers. This option is best for + use with user applications that can translate the raw + numbers better than having it done in the kernel. - hex - Similar to raw, but the numbers will be in a hexadecimal format. + hex - Similar to raw, but the numbers will be in a hexadecimal + format. bin - This will print out the formats in raw binary. block - TBD (needs update) - stacktrace - This is one of the options that changes the trace itself. - When a trace is recorded, so is the stack of functions. - This allows for back traces of trace sites. + stacktrace - This is one of the options that changes the trace + itself. When a trace is recorded, so is the stack + of functions. This allows for back traces of + trace sites. - userstacktrace - This option changes the trace. - It records a stacktrace of the current userspace thread. + userstacktrace - This option changes the trace. It records a + stacktrace of the current userspace thread. - sym-userobj - when user stacktrace are enabled, look up which object the - address belongs to, and print a relative address - This is especially useful when ASLR is on, otherwise you don't - get a chance to resolve the address to object/file/line after the app is no - longer running + sym-userobj - when user stacktrace are enabled, look up which + object the address belongs to, and print a + relative address. This is especially useful when + ASLR is on, otherwise you don't get a chance to + resolve the address to object/file/line after + the app is no longer running - The lookup is performed when you read trace,trace_pipe,latency_trace. Example: + The lookup is performed when you read + trace,trace_pipe,latency_trace. Example: a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0 x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] - sched-tree - TBD (any users??) + sched-tree - trace all tasks that are on the runqueue, at + every scheduling event. Will add overhead if + there's a lot of tasks running at once. sched_switch @@ -443,18 +504,19 @@ of how to use it. [...] -As we have discussed previously about this format, the header shows -the name of the trace and points to the options. The "FUNCTION" -is a misnomer since here it represents the wake ups and context -switches. +As we have discussed previously about this format, the header +shows the name of the trace and points to the options. The +"FUNCTION" is a misnomer since here it represents the wake ups +and context switches. -The sched_switch file only lists the wake ups (represented with '+') -and context switches ('==>') with the previous task or current task -first followed by the next task or task waking up. The format for both -of these is PID:KERNEL-PRIO:TASK-STATE. Remember that the KERNEL-PRIO -is the inverse of the actual priority with zero (0) being the highest -priority and the nice values starting at 100 (nice -20). Below is -a quick chart to map the kernel priority to user land priorities. +The sched_switch file only lists the wake ups (represented with +'+') and context switches ('==>') with the previous task or +current task first followed by the next task or task waking up. +The format for both of these is PID:KERNEL-PRIO:TASK-STATE. +Remember that the KERNEL-PRIO is the inverse of the actual +priority with zero (0) being the highest priority and the nice +values starting at 100 (nice -20). Below is a quick chart to map +the kernel priority to user land priorities. Kernel priority: 0 to 99 ==> user RT priority 99 to 0 Kernel priority: 100 to 139 ==> user nice -20 to 19 @@ -475,10 +537,10 @@ The task states are: ftrace_enabled -------------- -The following tracers (listed below) give different output depending -on whether or not the sysctl ftrace_enabled is set. To set ftrace_enabled, -one can either use the sysctl function or set it via the proc -file system interface. +The following tracers (listed below) give different output +depending on whether or not the sysctl ftrace_enabled is set. To +set ftrace_enabled, one can either use the sysctl function or +set it via the proc file system interface. sysctl kernel.ftrace_enabled=1 @@ -486,12 +548,12 @@ file system interface. echo 1 > /proc/sys/kernel/ftrace_enabled -To disable ftrace_enabled simply replace the '1' with '0' in -the above commands. +To disable ftrace_enabled simply replace the '1' with '0' in the +above commands. -When ftrace_enabled is set the tracers will also record the functions -that are within the trace. The descriptions of the tracers -will also show an example with ftrace enabled. +When ftrace_enabled is set the tracers will also record the +functions that are within the trace. The descriptions of the +tracers will also show an example with ftrace enabled. irqsoff @@ -499,17 +561,18 @@ irqsoff When interrupts are disabled, the CPU can not react to any other external event (besides NMIs and SMIs). This prevents the timer -interrupt from triggering or the mouse interrupt from letting the -kernel know of a new mouse event. The result is a latency with the -reaction time. +interrupt from triggering or the mouse interrupt from letting +the kernel know of a new mouse event. The result is a latency +with the reaction time. -The irqsoff tracer tracks the time for which interrupts are disabled. -When a new maximum latency is hit, the tracer saves the trace leading up -to that latency point so that every time a new maximum is reached, the old -saved trace is discarded and the new trace is saved. +The irqsoff tracer tracks the time for which interrupts are +disabled. When a new maximum latency is hit, the tracer saves +the trace leading up to that latency point so that every time a +new maximum is reached, the old saved trace is discarded and the +new trace is saved. -To reset the maximum, echo 0 into tracing_max_latency. Here is an -example: +To reset the maximum, echo 0 into tracing_max_latency. Here is +an example: # echo irqsoff > /debug/tracing/current_tracer # echo 0 > /debug/tracing/tracing_max_latency @@ -544,10 +607,11 @@ irqsoff latency trace v1.1.5 on 2.6.26 Here we see that that we had a latency of 12 microsecs (which is -very good). The _write_lock_irq in sys_setpgid disabled interrupts. -The difference between the 12 and the displayed timestamp 14us occurred -because the clock was incremented between the time of recording the max -latency and the time of recording the function that had that latency. +very good). The _write_lock_irq in sys_setpgid disabled +interrupts. The difference between the 12 and the displayed +timestamp 14us occurred because the clock was incremented +between the time of recording the max latency and the time of +recording the function that had that latency. Note the above example had ftrace_enabled not set. If we set the ftrace_enabled, we get a much larger output: @@ -598,24 +662,24 @@ irqsoff latency trace v1.1.5 on 2.6.26-rc8 Here we traced a 50 microsecond latency. But we also see all the -functions that were called during that time. Note that by enabling -function tracing, we incur an added overhead. This overhead may -extend the latency times. But nevertheless, this trace has provided -some very helpful debugging information. +functions that were called during that time. Note that by +enabling function tracing, we incur an added overhead. This +overhead may extend the latency times. But nevertheless, this +trace has provided some very helpful debugging information. preemptoff ---------- -When preemption is disabled, we may be able to receive interrupts but -the task cannot be preempted and a higher priority task must wait -for preemption to be enabled again before it can preempt a lower -priority task. +When preemption is disabled, we may be able to receive +interrupts but the task cannot be preempted and a higher +priority task must wait for preemption to be enabled again +before it can preempt a lower priority task. The preemptoff tracer traces the places that disable preemption. -Like the irqsoff tracer, it records the maximum latency for which preemption -was disabled. The control of preemptoff tracer is much like the irqsoff -tracer. +Like the irqsoff tracer, it records the maximum latency for +which preemption was disabled. The control of preemptoff tracer +is much like the irqsoff tracer. # echo preemptoff > /debug/tracing/current_tracer # echo 0 > /debug/tracing/tracing_max_latency @@ -649,11 +713,12 @@ preemptoff latency trace v1.1.5 on 2.6.26-rc8 sshd-4261 0d.s1 30us : trace_preempt_on (__do_softirq) -This has some more changes. Preemption was disabled when an interrupt -came in (notice the 'h'), and was enabled while doing a softirq. -(notice the 's'). But we also see that interrupts have been disabled -when entering the preempt off section and leaving it (the 'd'). -We do not know if interrupts were enabled in the mean time. +This has some more changes. Preemption was disabled when an +interrupt came in (notice the 'h'), and was enabled while doing +a softirq. (notice the 's'). But we also see that interrupts +have been disabled when entering the preempt off section and +leaving it (the 'd'). We do not know if interrupts were enabled +in the mean time. # tracer: preemptoff # @@ -712,28 +777,30 @@ preemptoff latency trace v1.1.5 on 2.6.26-rc8 sshd-4261 0d.s1 64us : trace_preempt_on (__do_softirq) -The above is an example of the preemptoff trace with ftrace_enabled -set. Here we see that interrupts were disabled the entire time. -The irq_enter code lets us know that we entered an interrupt 'h'. -Before that, the functions being traced still show that it is not -in an interrupt, but we can see from the functions themselves that -this is not the case. +The above is an example of the preemptoff trace with +ftrace_enabled set. Here we see that interrupts were disabled +the entire time. The irq_enter code lets us know that we entered +an interrupt 'h'. Before that, the functions being traced still +show that it is not in an interrupt, but we can see from the +functions themselves that this is not the case. -Notice that __do_softirq when called does not have a preempt_count. -It may seem that we missed a preempt enabling. What really happened -is that the preempt count is held on the thread's stack and we -switched to the softirq stack (4K stacks in effect). The code -does not copy the preempt count, but because interrupts are disabled, -we do not need to worry about it. Having a tracer like this is good -for letting people know what really happens inside the kernel. +Notice that __do_softirq when called does not have a +preempt_count. It may seem that we missed a preempt enabling. +What really happened is that the preempt count is held on the +thread's stack and we switched to the softirq stack (4K stacks +in effect). The code does not copy the preempt count, but +because interrupts are disabled, we do not need to worry about +it. Having a tracer like this is good for letting people know +what really happens inside the kernel. preemptirqsoff -------------- -Knowing the locations that have interrupts disabled or preemption -disabled for the longest times is helpful. But sometimes we would -like to know when either preemption and/or interrupts are disabled. +Knowing the locations that have interrupts disabled or +preemption disabled for the longest times is helpful. But +sometimes we would like to know when either preemption and/or +interrupts are disabled. Consider the following code: @@ -753,11 +820,13 @@ The preemptoff tracer will record the total length of call_function_with_irqs_and_preemption_off() and call_function_with_preemption_off(). -But neither will trace the time that interrupts and/or preemption -is disabled. This total time is the time that we can not schedule. -To record this time, use the preemptirqsoff tracer. +But neither will trace the time that interrupts and/or +preemption is disabled. This total time is the time that we can +not schedule. To record this time, use the preemptirqsoff +tracer. -Again, using this trace is much like the irqsoff and preemptoff tracers. +Again, using this trace is much like the irqsoff and preemptoff +tracers. # echo preemptirqsoff > /debug/tracing/current_tracer # echo 0 > /debug/tracing/tracing_max_latency @@ -793,9 +862,10 @@ preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 The trace_hardirqs_off_thunk is called from assembly on x86 when -interrupts are disabled in the assembly code. Without the function -tracing, we do not know if interrupts were enabled within the preemption -points. We do see that it started with preemption enabled. +interrupts are disabled in the assembly code. Without the +function tracing, we do not know if interrupts were enabled +within the preemption points. We do see that it started with +preemption enabled. Here is a trace with ftrace_enabled set: @@ -883,40 +953,42 @@ preemptirqsoff latency trace v1.1.5 on 2.6.26-rc8 sshd-4261 0d.s1 105us : trace_preempt_on (__do_softirq) -This is a very interesting trace. It started with the preemption of -the ls task. We see that the task had the "need_resched" bit set -via the 'N' in the trace. Interrupts were disabled before the spin_lock -at the beginning of the trace. We see that a schedule took place to run -sshd. When the interrupts were enabled, we took an interrupt. -On return from the interrupt handler, the softirq ran. We took another -interrupt while running the softirq as we see from the capital 'H'. +This is a very interesting trace. It started with the preemption +of the ls task. We see that the task had the "need_resched" bit +set via the 'N' in the trace. Interrupts were disabled before +the spin_lock at the beginning of the trace. We see that a +schedule took place to run sshd. When the interrupts were +enabled, we took an interrupt. On return from the interrupt +handler, the softirq ran. We took another interrupt while +running the softirq as we see from the capital 'H'. wakeup ------ -In a Real-Time environment it is very important to know the wakeup -time it takes for the highest priority task that is woken up to the -time that it executes. This is also known as "schedule latency". -I stress the point that this is about RT tasks. It is also important -to know the scheduling latency of non-RT tasks, but the average -schedule latency is better for non-RT tasks. Tools like -LatencyTop are more appropriate for such measurements. +In a Real-Time environment it is very important to know the +wakeup time it takes for the highest priority task that is woken +up to the time that it executes. This is also known as "schedule +latency". I stress the point that this is about RT tasks. It is +also important to know the scheduling latency of non-RT tasks, +but the average schedule latency is better for non-RT tasks. +Tools like LatencyTop are more appropriate for such +measurements. Real-Time environments are interested in the worst case latency. -That is the longest latency it takes for something to happen, and -not the average. We can have a very fast scheduler that may only -have a large latency once in a while, but that would not work well -with Real-Time tasks. The wakeup tracer was designed to record -the worst case wakeups of RT tasks. Non-RT tasks are not recorded -because the tracer only records one worst case and tracing non-RT -tasks that are unpredictable will overwrite the worst case latency -of RT tasks. - -Since this tracer only deals with RT tasks, we will run this slightly -differently than we did with the previous tracers. Instead of performing -an 'ls', we will run 'sleep 1' under 'chrt' which changes the -priority of the task. +That is the longest latency it takes for something to happen, +and not the average. We can have a very fast scheduler that may +only have a large latency once in a while, but that would not +work well with Real-Time tasks. The wakeup tracer was designed +to record the worst case wakeups of RT tasks. Non-RT tasks are +not recorded because the tracer only records one worst case and +tracing non-RT tasks that are unpredictable will overwrite the +worst case latency of RT tasks. + +Since this tracer only deals with RT tasks, we will run this +slightly differently than we did with the previous tracers. +Instead of performing an 'ls', we will run 'sleep 1' under +'chrt' which changes the priority of the task. # echo wakeup > /debug/tracing/current_tracer # echo 0 > /debug/tracing/tracing_max_latency @@ -946,17 +1018,16 @@ wakeup latency trace v1.1.5 on 2.6.26-rc8 -0 1d..4 4us : schedule (cpu_idle) +Running this on an idle system, we see that it only took 4 +microseconds to perform the task switch. Note, since the trace +marker in the schedule is before the actual "switch", we stop +the tracing when the recorded task is about to schedule in. This +may change if we add a new marker at the end of the scheduler. -Running this on an idle system, we see that it only took 4 microseconds -to perform the task switch. Note, since the trace marker in the -schedule is before the actual "switch", we stop the tracing when -the recorded task is about to schedule in. This may change if -we add a new marker at the end of the scheduler. - -Notice that the recorded task is 'sleep' with the PID of 4901 and it -has an rt_prio of 5. This priority is user-space priority and not -the internal kernel priority. The policy is 1 for SCHED_FIFO and 2 -for SCHED_RR. +Notice that the recorded task is 'sleep' with the PID of 4901 +and it has an rt_prio of 5. This priority is user-space priority +and not the internal kernel priority. The policy is 1 for +SCHED_FIFO and 2 for SCHED_RR. Doing the same with chrt -r 5 and ftrace_enabled set. @@ -1013,24 +1084,25 @@ ksoftirq-7 1d..6 49us : _spin_unlock (tracing_record_cmdline) ksoftirq-7 1d..6 49us : sub_preempt_count (_spin_unlock) ksoftirq-7 1d..4 50us : schedule (__cond_resched) -The interrupt went off while running ksoftirqd. This task runs at -SCHED_OTHER. Why did not we see the 'N' set early? This may be -a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K stacks -configured, the interrupt and softirq run with their own stack. -Some information is held on the top of the task's stack (need_resched -and preempt_count are both stored there). The setting of the NEED_RESCHED -bit is done directly to the task's stack, but the reading of the -NEED_RESCHED is done by looking at the current stack, which in this case -is the stack for the hard interrupt. This hides the fact that NEED_RESCHED -has been set. We do not see the 'N' until we switch back to the task's +The interrupt went off while running ksoftirqd. This task runs +at SCHED_OTHER. Why did not we see the 'N' set early? This may +be a harmless bug with x86_32 and 4K stacks. On x86_32 with 4K +stacks configured, the interrupt and softirq run with their own +stack. Some information is held on the top of the task's stack +(need_resched and preempt_count are both stored there). The +setting of the NEED_RESCHED bit is done directly to the task's +stack, but the reading of the NEED_RESCHED is done by looking at +the current stack, which in this case is the stack for the hard +interrupt. This hides the fact that NEED_RESCHED has been set. +We do not see the 'N' until we switch back to the task's assigned stack. function -------- This tracer is the function tracer. Enabling the function tracer -can be done from the debug file system. Make sure the ftrace_enabled is -set; otherwise this tracer is a nop. +can be done from the debug file system. Make sure the +ftrace_enabled is set; otherwise this tracer is a nop. # sysctl kernel.ftrace_enabled=1 # echo function > /debug/tracing/current_tracer @@ -1060,14 +1132,15 @@ set; otherwise this tracer is a nop. [...] -Note: function tracer uses ring buffers to store the above entries. -The newest data may overwrite the oldest data. Sometimes using echo to -stop the trace is not sufficient because the tracing could have overwritten -the data that you wanted to record. For this reason, it is sometimes better to -disable tracing directly from a program. This allows you to stop the -tracing at the point that you hit the part that you are interested in. -To disable the tracing directly from a C program, something like following -code snippet can be used: +Note: function tracer uses ring buffers to store the above +entries. The newest data may overwrite the oldest data. +Sometimes using echo to stop the trace is not sufficient because +the tracing could have overwritten the data that you wanted to +record. For this reason, it is sometimes better to disable +tracing directly from a program. This allows you to stop the +tracing at the point that you hit the part that you are +interested in. To disable the tracing directly from a C program, +something like following code snippet can be used: int trace_fd; [...] @@ -1082,10 +1155,10 @@ int main(int argc, char *argv[]) { } Note: Here we hard coded the path name. The debugfs mount is not -guaranteed to be at /debug (and is more commonly at /sys/kernel/debug). -For simple one time traces, the above is sufficent. For anything else, -a search through /proc/mounts may be needed to find where the debugfs -file-system is mounted. +guaranteed to be at /debug (and is more commonly at +/sys/kernel/debug). For simple one time traces, the above is +sufficent. For anything else, a search through /proc/mounts may +be needed to find where the debugfs file-system is mounted. Single thread tracing @@ -1186,10 +1259,11 @@ following format: 0 scheduler_tick+0x1b6/0x1bf <- scheduler_tick+0x1aa/0x1bf -The tracer may be used to dump the trace for the oops'ing cpu on a -kernel oops into the system log. To enable this, ftrace_dump_on_oops -must be set. To set ftrace_dump_on_oops, one can either use the sysctl -function or set it via the proc system interface. +The tracer may be used to dump the trace for the oops'ing cpu on +a kernel oops into the system log. To enable this, +ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one +can either use the sysctl function or set it via the proc system +interface. sysctl kernel.ftrace_dump_on_oops=1 @@ -1198,8 +1272,8 @@ or echo 1 > /proc/sys/kernel/ftrace_dump_on_oops -Here's an example of such a dump after a null pointer dereference in a -kernel module: +Here's an example of such a dump after a null pointer +dereference in a kernel module: [57848.105921] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [57848.106019] IP: [] open+0x6/0x14 [oops] @@ -1239,25 +1313,34 @@ kernel module: function graph tracer --------------------------- -This tracer is similar to the function tracer except that it probes -a function on its entry and its exit. -This is done by setting a dynamically allocated stack of return addresses on each -task_struct. Then the tracer overwrites the return address of each function traced -to set a custom probe. Thus the original return address is stored on the stack of return -address in the task_struct. +This tracer is similar to the function tracer except that it +probes a function on its entry and its exit. This is done by +using a dynamically allocated stack of return addresses in each +task_struct. On function entry the tracer overwrites the return +address of each function traced to set a custom probe. Thus the +original return address is stored on the stack of return address +in the task_struct. -Probing on both extremities of a function leads to special features such as +Probing on both ends of a function leads to special features +such as: -_ measure of function's time execution -_ having a reliable call stack to draw function calls graph +- measure of a function's time execution +- having a reliable call stack to draw function calls graph This tracer is useful in several situations: -_ you want to find the reason of a strange kernel behavior and need to see - what happens in detail on any areas (or specific ones). -_ you are experiencing weird latencies but it's difficult to find its origin. -_ you want to find quickly which path is taken by a specific function -_ you just want to see what happens inside your kernel +- you want to find the reason of a strange kernel behavior and + need to see what happens in detail on any areas (or specific + ones). + +- you are experiencing weird latencies but it's difficult to + find its origin. + +- you want to find quickly which path is taken by a specific + function + +- you just want to peek inside a working kernel and want to see + what happens there. # tracer: function_graph # @@ -1282,24 +1365,28 @@ _ you just want to see what happens inside your kernel 0) 0.586 us | _spin_unlock(); -There are several columns that can be dynamically enabled/disabled. -You can use every combination of options you want, depending on your needs. +There are several columns that can be dynamically +enabled/disabled. You can use every combination of options you +want, depending on your needs. -_ The cpu number on which the function executed is default enabled. - It is sometimes better to only trace one cpu (see tracing_cpu_mask file) - or you might sometimes see unordered function calls while cpu tracing switch. +- The cpu number on which the function executed is default + enabled. It is sometimes better to only trace one cpu (see + tracing_cpu_mask file) or you might sometimes see unordered + function calls while cpu tracing switch. hide: echo nofuncgraph-cpu > /debug/tracing/trace_options show: echo funcgraph-cpu > /debug/tracing/trace_options -_ The duration (function's time of execution) is displayed on the closing bracket - line of a function or on the same line than the current function in case of a leaf - one. It is default enabled. +- The duration (function's time of execution) is displayed on + the closing bracket line of a function or on the same line + than the current function in case of a leaf one. It is default + enabled. hide: echo nofuncgraph-duration > /debug/tracing/trace_options show: echo funcgraph-duration > /debug/tracing/trace_options -_ The overhead field precedes the duration one in case of reached duration thresholds. +- The overhead field precedes the duration field in case of + reached duration thresholds. hide: echo nofuncgraph-overhead > /debug/tracing/trace_options show: echo funcgraph-overhead > /debug/tracing/trace_options @@ -1328,8 +1415,8 @@ _ The overhead field precedes the duration one in case of reached duration thres ! means that the function exceeded 100 usecs. -_ The task/pid field displays the thread cmdline and pid which executed the function. - It is default disabled. +- The task/pid field displays the thread cmdline and pid which + executed the function. It is default disabled. hide: echo nofuncgraph-proc > /debug/tracing/trace_options show: echo funcgraph-proc > /debug/tracing/trace_options @@ -1351,8 +1438,9 @@ _ The task/pid field displays the thread cmdline and pid which executed the func 0) sh-4802 | + 49.370 us | } -_ The absolute time field is an absolute timestamp given by the clock since - it started. A snapshot of this time is given on each entry/exit of functions +- The absolute time field is an absolute timestamp given by the + system clock since it started. A snapshot of this time is + given on each entry/exit of functions hide: echo nofuncgraph-abstime > /debug/tracing/trace_options show: echo funcgraph-abstime > /debug/tracing/trace_options @@ -1377,9 +1465,10 @@ _ The absolute time field is an absolute timestamp given by the clock since 360.774530 | 1) 0.594 us | __phys_addr(); -You can put some comments on specific functions by using ftrace_printk() -For example, if you want to put a comment inside the __might_sleep() function, -you just have to include and call ftrace_printk() inside __might_sleep() +You can put some comments on specific functions by using +ftrace_printk() For example, if you want to put a comment inside +the __might_sleep() function, you just have to include + and call ftrace_printk() inside __might_sleep() ftrace_printk("I'm a comment!\n") @@ -1390,8 +1479,9 @@ will produce: 1) 1.449 us | } -You might find other useful features for this tracer on the "dynamic ftrace" -section such as tracing only specific functions or tasks. +You might find other useful features for this tracer in the +following "dynamic ftrace" section such as tracing only specific +functions or tasks. dynamic ftrace -------------- @@ -1399,43 +1489,45 @@ dynamic ftrace If CONFIG_DYNAMIC_FTRACE is set, the system will run with virtually no overhead when function tracing is disabled. The way this works is the mcount function call (placed at the start of -every kernel function, produced by the -pg switch in gcc), starts -of pointing to a simple return. (Enabling FTRACE will include the --pg switch in the compiling of the kernel.) +every kernel function, produced by the -pg switch in gcc), +starts of pointing to a simple return. (Enabling FTRACE will +include the -pg switch in the compiling of the kernel.) At compile time every C file object is run through the recordmcount.pl script (located in the scripts directory). This script will process the C object using objdump to find all the -locations in the .text section that call mcount. (Note, only -the .text section is processed, since processing other sections -like .init.text may cause races due to those sections being freed). +locations in the .text section that call mcount. (Note, only the +.text section is processed, since processing other sections like +.init.text may cause races due to those sections being freed). -A new section called "__mcount_loc" is created that holds references -to all the mcount call sites in the .text section. This section is -compiled back into the original object. The final linker will add -all these references into a single table. +A new section called "__mcount_loc" is created that holds +references to all the mcount call sites in the .text section. +This section is compiled back into the original object. The +final linker will add all these references into a single table. On boot up, before SMP is initialized, the dynamic ftrace code -scans this table and updates all the locations into nops. It also -records the locations, which are added to the available_filter_functions -list. Modules are processed as they are loaded and before they are -executed. When a module is unloaded, it also removes its functions from -the ftrace function list. This is automatic in the module unload -code, and the module author does not need to worry about it. - -When tracing is enabled, kstop_machine is called to prevent races -with the CPUS executing code being modified (which can cause the -CPU to do undesireable things), and the nops are patched back -to calls. But this time, they do not call mcount (which is just -a function stub). They now call into the ftrace infrastructure. +scans this table and updates all the locations into nops. It +also records the locations, which are added to the +available_filter_functions list. Modules are processed as they +are loaded and before they are executed. When a module is +unloaded, it also removes its functions from the ftrace function +list. This is automatic in the module unload code, and the +module author does not need to worry about it. + +When tracing is enabled, kstop_machine is called to prevent +races with the CPUS executing code being modified (which can +cause the CPU to do undesireable things), and the nops are +patched back to calls. But this time, they do not call mcount +(which is just a function stub). They now call into the ftrace +infrastructure. One special side-effect to the recording of the functions being traced is that we can now selectively choose which functions we -wish to trace and which ones we want the mcount calls to remain as -nops. +wish to trace and which ones we want the mcount calls to remain +as nops. -Two files are used, one for enabling and one for disabling the tracing -of specified functions. They are: +Two files are used, one for enabling and one for disabling the +tracing of specified functions. They are: set_ftrace_filter @@ -1443,8 +1535,8 @@ and set_ftrace_notrace -A list of available functions that you can add to these files is listed -in: +A list of available functions that you can add to these files is +listed in: available_filter_functions @@ -1481,8 +1573,8 @@ hrtimer_interrupt sys_nanosleep -Perhaps this is not enough. The filters also allow simple wild cards. -Only the following are currently available +Perhaps this is not enough. The filters also allow simple wild +cards. Only the following are currently available * - will match functions that begin with * - will match functions that end with @@ -1492,9 +1584,9 @@ These are the only wild cards which are supported. * will not work. -Note: It is better to use quotes to enclose the wild cards, otherwise - the shell may expand the parameters into names of files in the local - directory. +Note: It is better to use quotes to enclose the wild cards, + otherwise the shell may expand the parameters into names + of files in the local directory. # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter @@ -1540,7 +1632,8 @@ This is because the '>' and '>>' act just like they do in bash. To rewrite the filters, use '>' To append to the filters, use '>>' -To clear out a filter so that all functions will be recorded again: +To clear out a filter so that all functions will be recorded +again: # echo > /debug/tracing/set_ftrace_filter # cat /debug/tracing/set_ftrace_filter @@ -1572,7 +1665,8 @@ hrtimer_get_res hrtimer_init_sleeper -The set_ftrace_notrace prevents those functions from being traced. +The set_ftrace_notrace prevents those functions from being +traced. # echo '*preempt*' '*lock*' > /debug/tracing/set_ftrace_notrace @@ -1595,18 +1689,20 @@ Produces: We can see that there's no more lock or preempt tracing. -* Dynamic ftrace with the function graph tracer * +Dynamic ftrace with the function graph tracer +--------------------------------------------- +Although what has been explained above concerns both the +function tracer and the function-graph-tracer, there are some +special features only available in the function-graph tracer. -Although what has been explained above concerns both the function tracer and -the function_graph_tracer, the following concerns only the latter. +If you want to trace only one function and all of its children, +you just have to echo its name into set_graph_function: -If you want to trace only one function and all of its childs, you just have -to echo its name on set_graph_function: + echo __do_fault > set_graph_function -echo __do_fault > set_graph_function - -will produce the following: +will produce the following "expanded" trace of the __do_fault() +function: 0) | __do_fault() { 0) | filemap_fault() { @@ -1643,23 +1739,24 @@ will produce the following: 0) 2.793 us | } 0) + 14.012 us | } -You can also select several functions: +You can also expand several functions at once: -echo sys_open > set_graph_function -echo sys_close >> set_graph_function + echo sys_open > set_graph_function + echo sys_close >> set_graph_function -Now if you want to go back to trace all functions +Now if you want to go back to trace all functions you can clear +this special filter via: -echo > set_graph_function + echo > set_graph_function trace_pipe ---------- -The trace_pipe outputs the same content as the trace file, but the effect -on the tracing is different. Every read from trace_pipe is consumed. -This means that subsequent reads will be different. The trace -is live. +The trace_pipe outputs the same content as the trace file, but +the effect on the tracing is different. Every read from +trace_pipe is consumed. This means that subsequent reads will be +different. The trace is live. # echo function > /debug/tracing/current_tracer # cat /debug/tracing/trace_pipe > /tmp/trace.out & @@ -1687,38 +1784,45 @@ is live. bash-4043 [00] 41.267111: select_task_rq_rt <-try_to_wake_up -Note, reading the trace_pipe file will block until more input is added. -By changing the tracer, trace_pipe will issue an EOF. We needed -to set the function tracer _before_ we "cat" the trace_pipe file. +Note, reading the trace_pipe file will block until more input is +added. By changing the tracer, trace_pipe will issue an EOF. We +needed to set the function tracer _before_ we "cat" the +trace_pipe file. trace entries ------------- -Having too much or not enough data can be troublesome in diagnosing -an issue in the kernel. The file buffer_size_kb is used to modify -the size of the internal trace buffers. The number listed -is the number of entries that can be recorded per CPU. To know -the full size, multiply the number of possible CPUS with the -number of entries. +Having too much or not enough data can be troublesome in +diagnosing an issue in the kernel. The file buffer_size_kb is +used to modify the size of the internal trace buffers. The +number listed is the number of entries that can be recorded per +CPU. To know the full size, multiply the number of possible CPUS +with the number of entries. # cat /debug/tracing/buffer_size_kb 1408 (units kilobytes) -Note, to modify this, you must have tracing completely disabled. To do that, -echo "nop" into the current_tracer. If the current_tracer is not set -to "nop", an EINVAL error will be returned. +Note, to modify this, you must have tracing completely disabled. +To do that, echo "nop" into the current_tracer. If the +current_tracer is not set to "nop", an EINVAL error will be +returned. # echo nop > /debug/tracing/current_tracer # echo 10000 > /debug/tracing/buffer_size_kb # cat /debug/tracing/buffer_size_kb 10000 (units kilobytes) -The number of pages which will be allocated is limited to a percentage -of available memory. Allocating too much will produce an error. +The number of pages which will be allocated is limited to a +percentage of available memory. Allocating too much will produce +an error. # echo 1000000000000 > /debug/tracing/buffer_size_kb -bash: echo: write error: Cannot allocate memory # cat /debug/tracing/buffer_size_kb 85 +----------- + +More details can be found in the source code, in the +kernel/tracing/*.c files. -- cgit v1.2.3-59-g8ed1b From 00a8bf859331e349713274825e6fbf20bf2ac15a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Feb 2009 13:01:37 +0100 Subject: tracing/function-graph-tracer: fix merge Merge artifact: pid got changed to ent->pid meanwhile. Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8f4004a00b4e..c009553a8e81 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -589,7 +589,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, /* Proc */ if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); + ret = print_graph_proc(s, ent->pid); if (ret == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3-59-g8ed1b From fe1200b63d158b28eef6d4de1e5b5f99c681ba2f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 17 Feb 2009 12:05:07 -0500 Subject: SLUB: Introduce and use SLUB_MAX_SIZE and SLUB_PAGE_SHIFT constants As a preparational patch to bump up page allocator pass-through threshold, introduce two new constants SLUB_MAX_SIZE and SLUB_PAGE_SHIFT and convert mm/slub.c to use them. Reported-by: "Zhang, Yanmin" Tested-by: "Zhang, Yanmin" Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 19 ++++++++++++++++--- mm/slub.c | 16 ++++++++-------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 2f5c16b1aacd..986e09dcfd8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -120,11 +120,24 @@ struct kmem_cache { #define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) +/* + * Maximum kmalloc object size handled by SLUB. Larger object allocations + * are passed through to the page allocator. The page allocator "fastpath" + * is relatively slow so we need this value sufficiently high so that + * performance critical objects are allocated through the SLUB fastpath. + * + * This should be dropped to PAGE_SIZE / 2 once the page allocator + * "fastpath" becomes competitive with the slab allocator fastpaths. + */ +#define SLUB_MAX_SIZE (PAGE_SIZE) + +#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 1) + /* * We keep the general caches in an array of slab caches that are used for * 2^x bytes of allocations. */ -extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; +extern struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT]; /* * Sorry that the following has to be that ugly but some versions of GCC @@ -212,7 +225,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { - if (size > PAGE_SIZE) + if (size > SLUB_MAX_SIZE) return kmalloc_large(size, flags); if (!(flags & SLUB_DMA)) { @@ -234,7 +247,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && - size <= PAGE_SIZE && !(flags & SLUB_DMA)) { + size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); if (!s) diff --git a/mm/slub.c b/mm/slub.c index bdc9abb08a23..5a5e7f5bf799 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2475,7 +2475,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; +struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; EXPORT_SYMBOL(kmalloc_caches); static int __init setup_slub_min_order(char *str) @@ -2537,7 +2537,7 @@ panic: } #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1]; +static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; static void sysfs_add_func(struct work_struct *w) { @@ -2658,7 +2658,7 @@ void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -2686,7 +2686,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, flags, node); s = get_slab(size, flags); @@ -2985,7 +2985,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { create_kmalloc_cache(&kmalloc_caches[i], "kmalloc", 1 << i, GFP_KERNEL); caches++; @@ -3022,7 +3022,7 @@ void __init kmem_cache_init(void) slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) kmalloc_caches[i]. name = kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); @@ -3222,7 +3222,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3238,7 +3238,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, { struct kmem_cache *s; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(size > SLUB_MAX_SIZE)) return kmalloc_large_node(size, gfpflags, node); s = get_slab(size, gfpflags); -- cgit v1.2.3-59-g8ed1b From f9349a8f978929a0c71d2c42ae299f7d462c239d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 Feb 2009 21:13:12 +0100 Subject: tracing/function-graph-tracer: make set_graph_function file support ftrace regex Impact: trace only functions matching a pattern The set_graph_function file let one to trace only one or several chosen functions and follow all their code flow. Currently, only a constant function name is allowed so this patch allows the ftrace_regex functions: - matches all functions that end with "name": echo *name > set_graph_function - matches all functions that begin with "name": echo name* > set_graph_function - matches all functions that contains "name": echo *name* > set_graph_function Example: echo mutex* > set_graph_function 0) | mutex_lock_nested() { 0) 0.563 us | __might_sleep(); 0) 2.072 us | } 0) | mutex_unlock() { 0) 1.036 us | __mutex_unlock_slowpath(); 0) 2.433 us | } 0) | mutex_unlock() { 0) 0.691 us | __mutex_unlock_slowpath(); 0) 1.787 us | } 0) | mutex_lock_interruptible_nested() { 0) 0.548 us | __might_sleep(); 0) 1.945 us | } Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 56 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7dd5a2bef9cd..cf59f4c54745 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1895,6 +1895,10 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); + /* Nothing, tell g_show to print all functions are enabled */ + if (!ftrace_graph_count && !*pos) + return (void *)1; + p = g_next(m, p, pos); return p; @@ -1913,6 +1917,11 @@ static int g_show(struct seq_file *m, void *v) if (!ptr) return 0; + if (ptr == (unsigned long *)1) { + seq_printf(m, "#### all functions enabled ####\n"); + return 0; + } + kallsyms_lookup(*ptr, NULL, NULL, NULL, str); seq_printf(m, "%s\n", str); @@ -1966,38 +1975,51 @@ ftrace_graph_read(struct file *file, char __user *ubuf, } static int -ftrace_set_func(unsigned long *array, int idx, char *buffer) +ftrace_set_func(unsigned long *array, int *idx, char *buffer) { - char str[KSYM_SYMBOL_LEN]; struct dyn_ftrace *rec; struct ftrace_page *pg; + int search_len; int found = 0; - int j; + int type, not; + char *search; + bool exists; + int i; if (ftrace_disabled) return -ENODEV; + /* decode regex */ + type = ftrace_setup_glob(buffer, strlen(buffer), &search, ¬); + if (not) + return -EINVAL; + + search_len = strlen(search); + mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { + if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + break; + if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) continue; - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - if (strcmp(str, buffer) == 0) { - /* Return 1 if we add it to the array */ - found = 1; - for (j = 0; j < idx; j++) - if (array[j] == rec->ip) { - found = 0; + if (ftrace_match_record(rec, search, search_len, type)) { + /* ensure it is not already in the array */ + exists = false; + for (i = 0; i < *idx; i++) + if (array[i] == rec->ip) { + exists = true; break; } - if (found) - array[idx] = rec->ip; - goto out; + if (!exists) { + array[(*idx)++] = rec->ip; + found = 1; + } } } while_for_each_ftrace_rec(); - out: + mutex_unlock(&ftrace_lock); return found ? 0 : -EINVAL; @@ -2066,13 +2088,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, } buffer[index] = 0; - /* we allow only one at a time */ - ret = ftrace_set_func(array, ftrace_graph_count, buffer); + /* we allow only one expression at a time */ + ret = ftrace_set_func(array, &ftrace_graph_count, buffer); if (ret) goto out; - ftrace_graph_count++; - file->f_pos += read; ret = read; -- cgit v1.2.3-59-g8ed1b From 1b24bc3aaba899f4e7f681cbbc3b93cadcf4d770 Mon Sep 17 00:00:00 2001 From: Corentin Chary Date: Thu, 5 Feb 2009 22:25:52 +0100 Subject: UBI: add fsync capability Now, we can call fsync() on an UBI volume. Signed-off-by: Corentin Chary Signed-off-by: Artem Bityutskiy --- drivers/mtd/ubi/cdev.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index e63c8fc3df3a..f8e0f68f2186 100644 --- a/drivers/mtd/ubi/cdev.c +++ b/drivers/mtd/ubi/cdev.c @@ -186,6 +186,16 @@ static loff_t vol_cdev_llseek(struct file *file, loff_t offset, int origin) return new_offset; } +static int vol_cdev_fsync(struct file *file, struct dentry *dentry, + int datasync) +{ + struct ubi_volume_desc *desc = file->private_data; + struct ubi_device *ubi = desc->vol->ubi; + + return ubi_sync(ubi->ubi_num); +} + + static ssize_t vol_cdev_read(struct file *file, __user char *buf, size_t count, loff_t *offp) { @@ -1073,6 +1083,7 @@ const struct file_operations ubi_vol_cdev_operations = { .llseek = vol_cdev_llseek, .read = vol_cdev_read, .write = vol_cdev_write, + .fsync = vol_cdev_fsync, .unlocked_ioctl = vol_cdev_ioctl, .compat_ioctl = vol_cdev_compat_ioctl, }; -- cgit v1.2.3-59-g8ed1b From 6503e5df08008b9a47022b5e9ebba658c8fa69af Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 27 Nov 2008 17:48:13 +0000 Subject: thermal: use integers rather than strings for thermal values The thermal API currently uses strings to pass values to userspace. This makes it difficult to use from within the kernel. Change the interface to use integers and fix up the consumers. Signed-off-by: Matthew Garrett Acked-by: Zhang Rui Acked-by: Thomas Renninger Signed-off-by: Len Brown --- drivers/acpi/fan.c | 20 ++++---- drivers/acpi/processor_thermal.c | 20 ++++---- drivers/acpi/thermal.c | 80 ++++++++++++++++++++------------ drivers/acpi/video.c | 22 +++++---- drivers/platform/x86/intel_menlow.c | 29 ++++-------- drivers/thermal/thermal_sys.c | 91 +++++++++++++++++++++++++++++++------ include/linux/thermal.h | 32 +++++++++---- 7 files changed, 198 insertions(+), 96 deletions(-) diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index eaaee1660bdf..ae41cf3cf4e5 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -68,31 +68,35 @@ static struct acpi_driver acpi_fan_driver = { }; /* thermal cooling device callbacks */ -static int fan_get_max_state(struct thermal_cooling_device *cdev, char *buf) +static int fan_get_max_state(struct thermal_cooling_device *cdev, unsigned long + *state) { /* ACPI fan device only support two states: ON/OFF */ - return sprintf(buf, "1\n"); + *state = 1; + return 0; } -static int fan_get_cur_state(struct thermal_cooling_device *cdev, char *buf) +static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long + *state) { struct acpi_device *device = cdev->devdata; - int state; int result; + int acpi_state; if (!device) return -EINVAL; - result = acpi_bus_get_power(device->handle, &state); + result = acpi_bus_get_power(device->handle, &acpi_state); if (result) return result; - return sprintf(buf, "%s\n", state == ACPI_STATE_D3 ? "0" : - (state == ACPI_STATE_D0 ? "1" : "unknown")); + *state = (acpi_state == ACPI_STATE_D3 ? 0 : + (acpi_state == ACPI_STATE_D0 ? 1 : -1)); + return 0; } static int -fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned int state) +fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) { struct acpi_device *device = cdev->devdata; int result; diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index b1eb376fae45..0e47e299a9ac 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -373,7 +373,8 @@ static int acpi_processor_max_state(struct acpi_processor *pr) return max_state; } static int -processor_get_max_state(struct thermal_cooling_device *cdev, char *buf) +processor_get_max_state(struct thermal_cooling_device *cdev, + unsigned long *state) { struct acpi_device *device = cdev->devdata; struct acpi_processor *pr = acpi_driver_data(device); @@ -381,28 +382,29 @@ processor_get_max_state(struct thermal_cooling_device *cdev, char *buf) if (!device || !pr) return -EINVAL; - return sprintf(buf, "%d\n", acpi_processor_max_state(pr)); + *state = acpi_processor_max_state(pr); + return 0; } static int -processor_get_cur_state(struct thermal_cooling_device *cdev, char *buf) +processor_get_cur_state(struct thermal_cooling_device *cdev, + unsigned long *cur_state) { struct acpi_device *device = cdev->devdata; struct acpi_processor *pr = acpi_driver_data(device); - int cur_state; if (!device || !pr) return -EINVAL; - cur_state = cpufreq_get_cur_state(pr->id); + *cur_state = cpufreq_get_cur_state(pr->id); if (pr->flags.throttling) - cur_state += pr->throttling.state; - - return sprintf(buf, "%d\n", cur_state); + *cur_state += pr->throttling.state; + return 0; } static int -processor_set_cur_state(struct thermal_cooling_device *cdev, unsigned int state) +processor_set_cur_state(struct thermal_cooling_device *cdev, + unsigned long state) { struct acpi_device *device = cdev->devdata; struct acpi_processor *pr = acpi_driver_data(device); diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 99e6f1f8ea45..1c410ef859c6 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -954,7 +954,8 @@ static void acpi_thermal_check(void *data) /* sys I/F for generic thermal sysfs support */ #define KELVIN_TO_MILLICELSIUS(t) (t * 100 - 273200) -static int thermal_get_temp(struct thermal_zone_device *thermal, char *buf) +static int thermal_get_temp(struct thermal_zone_device *thermal, + unsigned long *temp) { struct acpi_thermal *tz = thermal->devdata; int result; @@ -966,25 +967,28 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, char *buf) if (result) return result; - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS(tz->temperature)); + *temp = KELVIN_TO_MILLICELSIUS(tz->temperature); + return 0; } static const char enabled[] = "kernel"; static const char disabled[] = "user"; static int thermal_get_mode(struct thermal_zone_device *thermal, - char *buf) + enum thermal_device_mode *mode) { struct acpi_thermal *tz = thermal->devdata; if (!tz) return -EINVAL; - return sprintf(buf, "%s\n", tz->tz_enabled ? - enabled : disabled); + *mode = tz->tz_enabled ? THERMAL_DEVICE_ENABLED : + THERMAL_DEVICE_DISABLED; + + return 0; } static int thermal_set_mode(struct thermal_zone_device *thermal, - const char *buf) + enum thermal_device_mode mode) { struct acpi_thermal *tz = thermal->devdata; int enable; @@ -995,9 +999,9 @@ static int thermal_set_mode(struct thermal_zone_device *thermal, /* * enable/disable thermal management from ACPI thermal driver */ - if (!strncmp(buf, enabled, sizeof enabled - 1)) + if (mode == THERMAL_DEVICE_ENABLED) enable = 1; - else if (!strncmp(buf, disabled, sizeof disabled - 1)) + else if (mode == THERMAL_DEVICE_DISABLED) enable = 0; else return -EINVAL; @@ -1013,7 +1017,7 @@ static int thermal_set_mode(struct thermal_zone_device *thermal, } static int thermal_get_trip_type(struct thermal_zone_device *thermal, - int trip, char *buf) + int trip, enum thermal_trip_type *type) { struct acpi_thermal *tz = thermal->devdata; int i; @@ -1022,27 +1026,35 @@ static int thermal_get_trip_type(struct thermal_zone_device *thermal, return -EINVAL; if (tz->trips.critical.flags.valid) { - if (!trip) - return sprintf(buf, "critical\n"); + if (!trip) { + *type = THERMAL_TRIP_CRITICAL; + return 0; + } trip--; } if (tz->trips.hot.flags.valid) { - if (!trip) - return sprintf(buf, "hot\n"); + if (!trip) { + *type = THERMAL_TRIP_HOT; + return 0; + } trip--; } if (tz->trips.passive.flags.valid) { - if (!trip) - return sprintf(buf, "passive\n"); + if (!trip) { + *type = THERMAL_TRIP_PASSIVE; + return 0; + } trip--; } for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++) { - if (!trip) - return sprintf(buf, "active%d\n", i); + if (!trip) { + *type = THERMAL_TRIP_ACTIVE; + return 0; + } trip--; } @@ -1050,7 +1062,7 @@ static int thermal_get_trip_type(struct thermal_zone_device *thermal, } static int thermal_get_trip_temp(struct thermal_zone_device *thermal, - int trip, char *buf) + int trip, unsigned long *temp) { struct acpi_thermal *tz = thermal->devdata; int i; @@ -1059,31 +1071,39 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, return -EINVAL; if (tz->trips.critical.flags.valid) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.critical.temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.critical.temperature); + return 0; + } trip--; } if (tz->trips.hot.flags.valid) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.hot.temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.hot.temperature); + return 0; + } trip--; } if (tz->trips.passive.flags.valid) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.passive.temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.passive.temperature); + return 0; + } trip--; } for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++) { - if (!trip) - return sprintf(buf, "%ld\n", KELVIN_TO_MILLICELSIUS( - tz->trips.active[i].temperature)); + if (!trip) { + *temp = KELVIN_TO_MILLICELSIUS( + tz->trips.active[i].temperature); + return 0; + } trip--; } diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index bb5ed059114a..5259d502add6 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -358,32 +358,36 @@ static struct output_properties acpi_output_properties = { /* thermal cooling device callbacks */ -static int video_get_max_state(struct thermal_cooling_device *cdev, char *buf) +static int video_get_max_state(struct thermal_cooling_device *cdev, unsigned + long *state) { struct acpi_device *device = cdev->devdata; struct acpi_video_device *video = acpi_driver_data(device); - return sprintf(buf, "%d\n", video->brightness->count - 3); + *state = video->brightness->count - 3; + return 0; } -static int video_get_cur_state(struct thermal_cooling_device *cdev, char *buf) +static int video_get_cur_state(struct thermal_cooling_device *cdev, unsigned + long *state) { struct acpi_device *device = cdev->devdata; struct acpi_video_device *video = acpi_driver_data(device); unsigned long long level; - int state; + int offset; acpi_video_device_lcd_get_level_current(video, &level); - for (state = 2; state < video->brightness->count; state++) - if (level == video->brightness->levels[state]) - return sprintf(buf, "%d\n", - video->brightness->count - state - 1); + for (offset = 2; offset < video->brightness->count; offset++) + if (level == video->brightness->levels[offset]) { + *state = video->brightness->count - offset - 1; + return 0; + } return -EINVAL; } static int -video_set_cur_state(struct thermal_cooling_device *cdev, unsigned int state) +video_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) { struct acpi_device *device = cdev->devdata; struct acpi_video_device *video = acpi_driver_data(device); diff --git a/drivers/platform/x86/intel_menlow.c b/drivers/platform/x86/intel_menlow.c index 27b7662955bb..29432a50be45 100644 --- a/drivers/platform/x86/intel_menlow.c +++ b/drivers/platform/x86/intel_menlow.c @@ -57,8 +57,8 @@ MODULE_LICENSE("GPL"); * In that case max_cstate would be n-1 * GTHS returning '0' would mean that no bandwidth control states are supported */ -static int memory_get_int_max_bandwidth(struct thermal_cooling_device *cdev, - unsigned long *max_state) +static int memory_get_max_bandwidth(struct thermal_cooling_device *cdev, + unsigned long *max_state) { struct acpi_device *device = cdev->devdata; acpi_handle handle = device->handle; @@ -83,22 +83,12 @@ static int memory_get_int_max_bandwidth(struct thermal_cooling_device *cdev, return 0; } -static int memory_get_max_bandwidth(struct thermal_cooling_device *cdev, - char *buf) -{ - unsigned long value; - if (memory_get_int_max_bandwidth(cdev, &value)) - return -EINVAL; - - return sprintf(buf, "%ld\n", value); -} - static int memory_get_cur_bandwidth(struct thermal_cooling_device *cdev, - char *buf) + unsigned long *value) { struct acpi_device *device = cdev->devdata; acpi_handle handle = device->handle; - unsigned long long value; + unsigned long long result; struct acpi_object_list arg_list; union acpi_object arg; acpi_status status = AE_OK; @@ -108,15 +98,16 @@ static int memory_get_cur_bandwidth(struct thermal_cooling_device *cdev, arg.type = ACPI_TYPE_INTEGER; arg.integer.value = MEMORY_ARG_CUR_BANDWIDTH; status = acpi_evaluate_integer(handle, MEMORY_GET_BANDWIDTH, - &arg_list, &value); + &arg_list, &result); if (ACPI_FAILURE(status)) return -EFAULT; - return sprintf(buf, "%llu\n", value); + *value = result; + return 0; } static int memory_set_cur_bandwidth(struct thermal_cooling_device *cdev, - unsigned int state) + unsigned long state) { struct acpi_device *device = cdev->devdata; acpi_handle handle = device->handle; @@ -126,7 +117,7 @@ static int memory_set_cur_bandwidth(struct thermal_cooling_device *cdev, unsigned long long temp; unsigned long max_state; - if (memory_get_int_max_bandwidth(cdev, &max_state)) + if (memory_get_max_bandwidth(cdev, &max_state)) return -EFAULT; if (state > max_state) @@ -142,7 +133,7 @@ static int memory_set_cur_bandwidth(struct thermal_cooling_device *cdev, &temp); printk(KERN_INFO - "Bandwidth value was %d: status is %d\n", state, status); + "Bandwidth value was %ld: status is %d\n", state, status); if (ACPI_FAILURE(status)) return -EFAULT; diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index 8171ca17b936..bd139adc6d32 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -104,22 +104,36 @@ static ssize_t temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); + long temperature; + int ret; if (!tz->ops->get_temp) return -EPERM; - return tz->ops->get_temp(tz, buf); + ret = tz->ops->get_temp(tz, &temperature); + + if (ret) + return ret; + + return sprintf(buf, "%ld\n", temperature); } static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); + enum thermal_device_mode mode; + int result; if (!tz->ops->get_mode) return -EPERM; - return tz->ops->get_mode(tz, buf); + result = tz->ops->get_mode(tz, &mode); + if (result) + return result; + + return sprintf(buf, "%s\n", mode == THERMAL_DEVICE_ENABLED ? "enabled" + : "disabled"); } static ssize_t @@ -132,7 +146,13 @@ mode_store(struct device *dev, struct device_attribute *attr, if (!tz->ops->set_mode) return -EPERM; - result = tz->ops->set_mode(tz, buf); + if (!strncmp(buf, "enabled", sizeof("enabled"))) + result = tz->ops->set_mode(tz, THERMAL_DEVICE_ENABLED); + else if (!strncmp(buf, "disabled", sizeof("disabled"))) + result = tz->ops->set_mode(tz, THERMAL_DEVICE_DISABLED); + else + result = -EINVAL; + if (result) return result; @@ -144,7 +164,8 @@ trip_point_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - int trip; + enum thermal_trip_type type; + int trip, result; if (!tz->ops->get_trip_type) return -EPERM; @@ -152,7 +173,22 @@ trip_point_type_show(struct device *dev, struct device_attribute *attr, if (!sscanf(attr->attr.name, "trip_point_%d_type", &trip)) return -EINVAL; - return tz->ops->get_trip_type(tz, trip, buf); + result = tz->ops->get_trip_type(tz, trip, &type); + if (result) + return result; + + switch (type) { + case THERMAL_TRIP_CRITICAL: + return sprintf(buf, "critical"); + case THERMAL_TRIP_HOT: + return sprintf(buf, "hot"); + case THERMAL_TRIP_PASSIVE: + return sprintf(buf, "passive"); + case THERMAL_TRIP_ACTIVE: + return sprintf(buf, "active"); + default: + return sprintf(buf, "unknown"); + } } static ssize_t @@ -160,7 +196,8 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_zone_device *tz = to_thermal_zone(dev); - int trip; + int trip, ret; + long temperature; if (!tz->ops->get_trip_temp) return -EPERM; @@ -168,7 +205,12 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr, if (!sscanf(attr->attr.name, "trip_point_%d_temp", &trip)) return -EINVAL; - return tz->ops->get_trip_temp(tz, trip, buf); + ret = tz->ops->get_trip_temp(tz, trip, &temperature); + + if (ret) + return ret; + + return sprintf(buf, "%ld\n", temperature); } static DEVICE_ATTR(type, 0444, type_show, NULL); @@ -236,8 +278,13 @@ thermal_cooling_device_max_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); + unsigned long state; + int ret; - return cdev->ops->get_max_state(cdev, buf); + ret = cdev->ops->get_max_state(cdev, &state); + if (ret) + return ret; + return sprintf(buf, "%ld\n", state); } static ssize_t @@ -245,8 +292,13 @@ thermal_cooling_device_cur_state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct thermal_cooling_device *cdev = to_cooling_device(dev); + unsigned long state; + int ret; - return cdev->ops->get_cur_state(cdev, buf); + ret = cdev->ops->get_cur_state(cdev, &state); + if (ret) + return ret; + return sprintf(buf, "%ld\n", state); } static ssize_t @@ -255,10 +307,10 @@ thermal_cooling_device_cur_state_store(struct device *dev, const char *buf, size_t count) { struct thermal_cooling_device *cdev = to_cooling_device(dev); - int state; + unsigned long state; int result; - if (!sscanf(buf, "%d\n", &state)) + if (!sscanf(buf, "%ld\n", &state)) return -EINVAL; if (state < 0) @@ -312,13 +364,20 @@ static DEVICE_ATTR(name, 0444, name_show, NULL); static ssize_t temp_input_show(struct device *dev, struct device_attribute *attr, char *buf) { + long temperature; + int ret; struct thermal_hwmon_attr *hwmon_attr = container_of(attr, struct thermal_hwmon_attr, attr); struct thermal_zone_device *tz = container_of(hwmon_attr, struct thermal_zone_device, temp_input); - return tz->ops->get_temp(tz, buf); + ret = tz->ops->get_temp(tz, &temperature); + + if (ret) + return ret; + + return sprintf(buf, "%ld\n", temperature); } static ssize_t @@ -330,8 +389,14 @@ temp_crit_show(struct device *dev, struct device_attribute *attr, struct thermal_zone_device *tz = container_of(hwmon_attr, struct thermal_zone_device, temp_crit); + long temperature; + int ret; + + ret = tz->ops->get_trip_temp(tz, 0, &temperature); + if (ret) + return ret; - return tz->ops->get_trip_temp(tz, 0, buf); + return sprintf(buf, "%ld\n", temperature); } diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 917707e6151d..4cb3292fb6e4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -31,23 +31,39 @@ struct thermal_zone_device; struct thermal_cooling_device; +enum thermal_device_mode { + THERMAL_DEVICE_DISABLED = 0, + THERMAL_DEVICE_ENABLED, +}; + +enum thermal_trip_type { + THERMAL_TRIP_ACTIVE = 0, + THERMAL_TRIP_PASSIVE, + THERMAL_TRIP_HOT, + THERMAL_TRIP_CRITICAL, +}; + struct thermal_zone_device_ops { int (*bind) (struct thermal_zone_device *, struct thermal_cooling_device *); int (*unbind) (struct thermal_zone_device *, struct thermal_cooling_device *); - int (*get_temp) (struct thermal_zone_device *, char *); - int (*get_mode) (struct thermal_zone_device *, char *); - int (*set_mode) (struct thermal_zone_device *, const char *); - int (*get_trip_type) (struct thermal_zone_device *, int, char *); - int (*get_trip_temp) (struct thermal_zone_device *, int, char *); + int (*get_temp) (struct thermal_zone_device *, unsigned long *); + int (*get_mode) (struct thermal_zone_device *, + enum thermal_device_mode *); + int (*set_mode) (struct thermal_zone_device *, + enum thermal_device_mode); + int (*get_trip_type) (struct thermal_zone_device *, int, + enum thermal_trip_type *); + int (*get_trip_temp) (struct thermal_zone_device *, int, + unsigned long *); int (*get_crit_temp) (struct thermal_zone_device *, unsigned long *); }; struct thermal_cooling_device_ops { - int (*get_max_state) (struct thermal_cooling_device *, char *); - int (*get_cur_state) (struct thermal_cooling_device *, char *); - int (*set_cur_state) (struct thermal_cooling_device *, unsigned int); + int (*get_max_state) (struct thermal_cooling_device *, unsigned long *); + int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *); + int (*set_cur_state) (struct thermal_cooling_device *, unsigned long); }; #define THERMAL_TRIPS_NONE -1 -- cgit v1.2.3-59-g8ed1b From 000ab691172db3921efa3cb7f17fc79235a1de7f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 13:35:06 -0500 Subject: ftrace: allow archs to preform pre and post process for code modification This patch creates the weak functions: ftrace_arch_code_modify_prepare and ftrace_arch_code_modify_post_process that are called before and after the stop machine is called to modify the kernel text. If the arch needs to do pre or post processing, it only needs to define these functions. [ Update: Ingo Molnar suggested using the name ftrace_arch_code_modify_* over using ftrace_arch_modify_* ] Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +++ kernel/trace/ftrace.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 677432b9cb7e..fdb2a89ae543 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -99,6 +99,9 @@ stack_trace_sysctl(struct ctl_table *table, int write, /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ #include +int ftrace_arch_code_modify_prepare(void); +int ftrace_arch_code_modify_post_process(void); + enum { FTRACE_FL_FREE = (1 << 0), FTRACE_FL_FAILED = (1 << 1), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fdf913dfc7e8..72316d9647bd 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -585,6 +585,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) return 1; } +/* + * archs can override this function if they must do something + * before the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_prepare(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * after the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_post_process(void) +{ + return 0; +} + static int __ftrace_modify_code(void *data) { int *command = data; @@ -607,7 +625,17 @@ static int __ftrace_modify_code(void *data) static void ftrace_run_update_code(int command) { + int ret; + + ret = ftrace_arch_code_modify_prepare(); + FTRACE_WARN_ON(ret); + if (ret) + return; + stop_machine(__ftrace_modify_code, &command, NULL); + + ret = ftrace_arch_code_modify_post_process(); + FTRACE_WARN_ON(ret); } static ftrace_func_t saved_ftrace_func; -- cgit v1.2.3-59-g8ed1b From 91f73f90d97fa67effbb49e0a79c50cf26dfe324 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 20 Feb 2009 17:34:06 +0100 Subject: tracing/markers: make markers select tracepoints Sometimes it happens that KConfig dependencies are not handled like in the following scenario: - config A bool - config B bool depends on A - config C bool select B If one selects C, then it will select B without checking its dependency to A, if A hasn't been selected elsewhere, it will result in a build failure. This is what happens on the following build error: kernel/built-in.o: In function `marker_update_probe_range': (.text+0x52f64): undefined reference to `tracepoint_probe_register_noupdate' kernel/built-in.o: In function `marker_update_probe_range': (.text+0x52f74): undefined reference to `tracepoint_probe_unregister_noupdate' kernel/built-in.o: In function `marker_update_probe_range': (.text+0x52fb9): undefined reference to `tracepoint_probe_unregister_noupdate' kernel/built-in.o: In function `marker_update_probes': marker.c:(.text+0x530ba): undefined reference to `tracepoint_probe_update_all' CONFIG_KVM_TRACE will select CONFIG_MARKER, but the latter depends on CONFIG_TRACEPOINTS which will not be selected. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index f068071fcc5d..26b5bab6f6e8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -945,7 +945,7 @@ config TRACEPOINTS config MARKERS bool "Activate markers" - depends on TRACEPOINTS + select TRACEPOINTS help Place an empty function call at each marker site. Can be dynamically changed for a probe function. -- cgit v1.2.3-59-g8ed1b From 16239630974516a8879a3695ee9b4dc661f79f96 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Feb 2009 17:57:30 -0500 Subject: ftrace, x86: make kernel text writable only for conversions Impact: keep kernel text read only Because dynamic ftrace converts the calls to mcount into and out of nops at run time, we needed to always keep the kernel text writable. But this defeats the point of CONFIG_DEBUG_RODATA. This patch converts the kernel code to writable before ftrace modifies the text, and converts it back to read only afterward. The kernel text is converted to read/write, stop_machine is called to modify the code, then the kernel text is converted back to read only. The original version used SYSTEM_STATE to determine when it was OK or not to change the code to rw or ro. Andrew Morton pointed out that using SYSTEM_STATE is a bad idea since there is no guarantee to what its state will actually be. Instead, I moved the check into the set_kernel_text_* functions themselves, and use a local variable to determine when it is OK to change the kernel text RW permissions. [ Update: Ingo Molnar suggested moving the prototypes to cacheflush.h ] Reviewed-by: Andrew Morton Signed-off-by: Steven Rostedt --- arch/x86/include/asm/cacheflush.h | 5 +++++ arch/x86/kernel/ftrace.c | 13 +++++++++++++ arch/x86/mm/init_32.c | 35 ++++++++++++++++++++++++++++++++--- arch/x86/mm/init_64.c | 37 ++++++++++++++++++++++++++++++++----- 4 files changed, 82 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 2f8466540fb5..6145063cfe0e 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -104,6 +104,11 @@ void clflush_cache_range(void *addr, unsigned int size); #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; +void set_kernel_text_rw(void); +void set_kernel_text_ro(void); +#else +static inline void set_kernel_text_rw(void) { } +static inline void set_kernel_text_ro(void) { } #endif #ifdef CONFIG_DEBUG_RODATA_TEST diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 231bdd3c5b1c..77857d4f7d0f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,18 @@ #ifdef CONFIG_DYNAMIC_FTRACE +int ftrace_arch_code_modify_prepare(void) +{ + set_kernel_text_rw(); + return 0; +} + +int ftrace_arch_code_modify_post_process(void) +{ + set_kernel_text_ro(); + return 0; +} + union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; struct { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2cef05074413..3eb2ed188a4c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1155,17 +1155,47 @@ static noinline int do_test_wp_bit(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); +static int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, start+size); + + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, start+size); + + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; -#ifndef CONFIG_DYNAMIC_FTRACE - /* Dynamic tracing modifies the kernel text section */ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel text: %luk\n", size >> 10); + kernel_set_to_readonly = 1; + #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", start, start+size); @@ -1174,7 +1204,6 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif -#endif /* CONFIG_DYNAMIC_FTRACE */ start += size; size = (unsigned long)__end_rodata - start; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e6d36b490250..63fdc531601d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -986,21 +986,48 @@ void free_initmem(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); +static int kernel_set_to_readonly; + +void set_kernel_text_rw(void) +{ + unsigned long start = PFN_ALIGN(_stext); + unsigned long end = PFN_ALIGN(__start_rodata); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read write\n", + start, end); + + set_memory_rw(start, (end - start) >> PAGE_SHIFT); +} + +void set_kernel_text_ro(void) +{ + unsigned long start = PFN_ALIGN(_stext); + unsigned long end = PFN_ALIGN(__start_rodata); + + if (!kernel_set_to_readonly) + return; + + pr_debug("Set kernel text: %lx - %lx for read only\n", + start, end); + + set_memory_ro(start, (end - start) >> PAGE_SHIFT); +} + void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; -#ifdef CONFIG_DYNAMIC_FTRACE - /* Dynamic tracing modifies the kernel text section */ - start = rodata_start; -#endif - printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); + kernel_set_to_readonly = 1; + /* * The rodata section (but not the kernel text!) should also be * not-executable. -- cgit v1.2.3-59-g8ed1b From 90c7ac49aa819feb9433b5310089fca6399881c0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 13:32:57 -0500 Subject: ftrace: immediately stop code modification if failure is detected Impact: fix to prevent NMI lockup If the page fault handler produces a WARN_ON in the modifying of text, and the system is setup to have a high frequency of NMIs, we can lock up the system on a failure to modify code. The modifying of code with NMIs allows all NMIs to modify the code if it is about to run. This prevents a modifier on one CPU from modifying code running in NMI context on another CPU. The modifying is done through stop_machine, so only NMIs must be considered. But if the write causes the page fault handler to produce a warning, the print can slow it down enough that as soon as it is done it will take another NMI before going back to the process context. The new NMI will perform the write again causing another print and this will hang the box. This patch turns off the writing as soon as a failure is detected and does not wait for it to be turned off by the process context. This will keep NMIs from getting stuck in this back and forth of print outs. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 77857d4f7d0f..c56d73894322 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -124,6 +124,10 @@ static void ftrace_mod_code(void) */ mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, MCOUNT_INSN_SIZE); + + /* if we fail, then kill any new writers */ + if (mod_code_status) + mod_code_write = 0; } void ftrace_nmi_enter(void) -- cgit v1.2.3-59-g8ed1b From 4377245aa93b65b6597e4b7bb460fb9abc48b56b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Feb 2009 13:41:27 -0500 Subject: ftrace: break out modify loop immediately on detection of error Impact: added precaution on failure detection Break out of the modifying loop as soon as a failure is detected. This is just an added precaution found by code review and was not found by any bug chasing. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 72316d9647bd..11ad796ca049 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -561,8 +561,11 @@ static void ftrace_replace_code(int enable) if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { ftrace_free_rec(rec); - } else + } else { ftrace_bug(failed, rec->ip); + /* Stop processing */ + return; + } } } } -- cgit v1.2.3-59-g8ed1b From b1569e99c795bf83b4ddf41c4f1c42761ab7f75e Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Wed, 3 Dec 2008 17:55:32 +0000 Subject: ACPI: move thermal trip handling to generic thermal layer The ACPI code currently carries its own thermal trip handling, meaning that any other thermal implementation will need to reimplement it. Move the code to the generic thermal layer. Signed-off-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/acpi/thermal.c | 458 +++++------------------------------------- drivers/thermal/thermal_sys.c | 188 ++++++++++++++++- include/linux/thermal.h | 15 +- 3 files changed, 248 insertions(+), 413 deletions(-) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 1c410ef859c6..0ec48d2f85c5 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -190,7 +189,6 @@ struct acpi_thermal { struct acpi_thermal_state state; struct acpi_thermal_trips trips; struct acpi_handle_list devices; - struct timer_list timer; struct thermal_zone_device *thermal_zone; int tz_enabled; struct mutex lock; @@ -290,6 +288,11 @@ static int acpi_thermal_set_polling(struct acpi_thermal *tz, int seconds) tz->polling_frequency = seconds * 10; /* Convert value to deci-seconds */ + tz->thermal_zone->polling_delay = seconds * 1000; + + if (tz->tz_enabled) + thermal_zone_device_update(tz->thermal_zone); + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Polling frequency set to %lu seconds\n", tz->polling_frequency/10)); @@ -569,386 +572,11 @@ static int acpi_thermal_get_trip_points(struct acpi_thermal *tz) return acpi_thermal_trips_update(tz, ACPI_TRIPS_INIT); } -static int acpi_thermal_critical(struct acpi_thermal *tz) -{ - if (!tz || !tz->trips.critical.flags.valid) - return -EINVAL; - - if (tz->temperature >= tz->trips.critical.temperature) { - printk(KERN_WARNING PREFIX "Critical trip point\n"); - tz->trips.critical.flags.enabled = 1; - } else if (tz->trips.critical.flags.enabled) - tz->trips.critical.flags.enabled = 0; - - acpi_bus_generate_proc_event(tz->device, ACPI_THERMAL_NOTIFY_CRITICAL, - tz->trips.critical.flags.enabled); - acpi_bus_generate_netlink_event(tz->device->pnp.device_class, - dev_name(&tz->device->dev), - ACPI_THERMAL_NOTIFY_CRITICAL, - tz->trips.critical.flags.enabled); - - /* take no action if nocrt is set */ - if(!nocrt) { - printk(KERN_EMERG - "Critical temperature reached (%ld C), shutting down.\n", - KELVIN_TO_CELSIUS(tz->temperature)); - orderly_poweroff(true); - } - - return 0; -} - -static int acpi_thermal_hot(struct acpi_thermal *tz) -{ - if (!tz || !tz->trips.hot.flags.valid) - return -EINVAL; - - if (tz->temperature >= tz->trips.hot.temperature) { - printk(KERN_WARNING PREFIX "Hot trip point\n"); - tz->trips.hot.flags.enabled = 1; - } else if (tz->trips.hot.flags.enabled) - tz->trips.hot.flags.enabled = 0; - - acpi_bus_generate_proc_event(tz->device, ACPI_THERMAL_NOTIFY_HOT, - tz->trips.hot.flags.enabled); - acpi_bus_generate_netlink_event(tz->device->pnp.device_class, - dev_name(&tz->device->dev), - ACPI_THERMAL_NOTIFY_HOT, - tz->trips.hot.flags.enabled); - - /* TBD: Call user-mode "sleep(S4)" function if nocrt is cleared */ - - return 0; -} - -static void acpi_thermal_passive(struct acpi_thermal *tz) -{ - int result = 1; - struct acpi_thermal_passive *passive = NULL; - int trend = 0; - int i = 0; - - - if (!tz || !tz->trips.passive.flags.valid) - return; - - passive = &(tz->trips.passive); - - /* - * Above Trip? - * ----------- - * Calculate the thermal trend (using the passive cooling equation) - * and modify the performance limit for all passive cooling devices - * accordingly. Note that we assume symmetry. - */ - if (tz->temperature >= passive->temperature) { - trend = - (passive->tc1 * (tz->temperature - tz->last_temperature)) + - (passive->tc2 * (tz->temperature - passive->temperature)); - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "trend[%d]=(tc1[%lu]*(tmp[%lu]-last[%lu]))+(tc2[%lu]*(tmp[%lu]-psv[%lu]))\n", - trend, passive->tc1, tz->temperature, - tz->last_temperature, passive->tc2, - tz->temperature, passive->temperature)); - passive->flags.enabled = 1; - /* Heating up? */ - if (trend > 0) - for (i = 0; i < passive->devices.count; i++) - acpi_processor_set_thermal_limit(passive-> - devices. - handles[i], - ACPI_PROCESSOR_LIMIT_INCREMENT); - /* Cooling off? */ - else if (trend < 0) { - for (i = 0; i < passive->devices.count; i++) - /* - * assume that we are on highest - * freq/lowest thrott and can leave - * passive mode, even in error case - */ - if (!acpi_processor_set_thermal_limit - (passive->devices.handles[i], - ACPI_PROCESSOR_LIMIT_DECREMENT)) - result = 0; - /* - * Leave cooling mode, even if the temp might - * higher than trip point This is because some - * machines might have long thermal polling - * frequencies (tsp) defined. We will fall back - * into passive mode in next cycle (probably quicker) - */ - if (result) { - passive->flags.enabled = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Disabling passive cooling, still above threshold," - " but we are cooling down\n")); - } - } - return; - } - - /* - * Below Trip? - * ----------- - * Implement passive cooling hysteresis to slowly increase performance - * and avoid thrashing around the passive trip point. Note that we - * assume symmetry. - */ - if (!passive->flags.enabled) - return; - for (i = 0; i < passive->devices.count; i++) - if (!acpi_processor_set_thermal_limit - (passive->devices.handles[i], - ACPI_PROCESSOR_LIMIT_DECREMENT)) - result = 0; - if (result) { - passive->flags.enabled = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Disabling passive cooling (zone is cool)\n")); - } -} - -static void acpi_thermal_active(struct acpi_thermal *tz) -{ - int result = 0; - struct acpi_thermal_active *active = NULL; - int i = 0; - int j = 0; - unsigned long maxtemp = 0; - - - if (!tz) - return; - - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) { - active = &(tz->trips.active[i]); - if (!active || !active->flags.valid) - break; - if (tz->temperature >= active->temperature) { - /* - * Above Threshold? - * ---------------- - * If not already enabled, turn ON all cooling devices - * associated with this active threshold. - */ - if (active->temperature > maxtemp) - tz->state.active_index = i; - maxtemp = active->temperature; - if (active->flags.enabled) - continue; - for (j = 0; j < active->devices.count; j++) { - result = - acpi_bus_set_power(active->devices. - handles[j], - ACPI_STATE_D0); - if (result) { - printk(KERN_WARNING PREFIX - "Unable to turn cooling device [%p] 'on'\n", - active->devices. - handles[j]); - continue; - } - active->flags.enabled = 1; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Cooling device [%p] now 'on'\n", - active->devices.handles[j])); - } - continue; - } - if (!active->flags.enabled) - continue; - /* - * Below Threshold? - * ---------------- - * Turn OFF all cooling devices associated with this - * threshold. - */ - for (j = 0; j < active->devices.count; j++) { - result = acpi_bus_set_power(active->devices.handles[j], - ACPI_STATE_D3); - if (result) { - printk(KERN_WARNING PREFIX - "Unable to turn cooling device [%p] 'off'\n", - active->devices.handles[j]); - continue; - } - active->flags.enabled = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Cooling device [%p] now 'off'\n", - active->devices.handles[j])); - } - } -} - -static void acpi_thermal_check(void *context); - -static void acpi_thermal_run(unsigned long data) -{ - struct acpi_thermal *tz = (struct acpi_thermal *)data; - if (!tz->zombie) - acpi_os_execute(OSL_GPE_HANDLER, acpi_thermal_check, (void *)data); -} - -static void acpi_thermal_active_off(void *data) -{ - int result = 0; - struct acpi_thermal *tz = data; - int i = 0; - int j = 0; - struct acpi_thermal_active *active = NULL; - - if (!tz) { - printk(KERN_ERR PREFIX "Invalid (NULL) context\n"); - return; - } - - result = acpi_thermal_get_temperature(tz); - if (result) - return; - - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) { - active = &(tz->trips.active[i]); - if (!active || !active->flags.valid) - break; - if (tz->temperature >= active->temperature) { - /* - * If the thermal temperature is greater than the - * active threshod, unnecessary to turn off the - * the active cooling device. - */ - continue; - } - /* - * Below Threshold? - * ---------------- - * Turn OFF all cooling devices associated with this - * threshold. - */ - for (j = 0; j < active->devices.count; j++) - result = acpi_bus_set_power(active->devices.handles[j], - ACPI_STATE_D3); - } -} - static void acpi_thermal_check(void *data) { - int result = 0; struct acpi_thermal *tz = data; - unsigned long sleep_time = 0; - unsigned long timeout_jiffies = 0; - int i = 0; - struct acpi_thermal_state state; - - - if (!tz) { - printk(KERN_ERR PREFIX "Invalid (NULL) context\n"); - return; - } - - /* Check if someone else is already running */ - if (!mutex_trylock(&tz->lock)) - return; - - state = tz->state; - - result = acpi_thermal_get_temperature(tz); - if (result) - goto unlock; - - if (!tz->tz_enabled) - goto unlock; - - memset(&tz->state, 0, sizeof(tz->state)); - - /* - * Check Trip Points - * ----------------- - * Compare the current temperature to the trip point values to see - * if we've entered one of the thermal policy states. Note that - * this function determines when a state is entered, but the - * individual policy decides when it is exited (e.g. hysteresis). - */ - if (tz->trips.critical.flags.valid) - state.critical |= - (tz->temperature >= tz->trips.critical.temperature); - if (tz->trips.hot.flags.valid) - state.hot |= (tz->temperature >= tz->trips.hot.temperature); - if (tz->trips.passive.flags.valid) - state.passive |= - (tz->temperature >= tz->trips.passive.temperature); - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) - if (tz->trips.active[i].flags.valid) - state.active |= - (tz->temperature >= - tz->trips.active[i].temperature); - - /* - * Invoke Policy - * ------------- - * Separated from the above check to allow individual policy to - * determine when to exit a given state. - */ - if (state.critical) - acpi_thermal_critical(tz); - if (state.hot) - acpi_thermal_hot(tz); - if (state.passive) - acpi_thermal_passive(tz); - if (state.active) - acpi_thermal_active(tz); - - /* - * Calculate State - * --------------- - * Again, separated from the above two to allow independent policy - * decisions. - */ - tz->state.critical = tz->trips.critical.flags.enabled; - tz->state.hot = tz->trips.hot.flags.enabled; - tz->state.passive = tz->trips.passive.flags.enabled; - tz->state.active = 0; - for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE; i++) - tz->state.active |= tz->trips.active[i].flags.enabled; - - /* - * Calculate Sleep Time - * -------------------- - * If we're in the passive state, use _TSP's value. Otherwise - * use the default polling frequency (e.g. _TZP). If no polling - * frequency is specified then we'll wait forever (at least until - * a thermal event occurs). Note that _TSP and _TZD values are - * given in 1/10th seconds (we must covert to milliseconds). - */ - if (tz->state.passive) { - sleep_time = tz->trips.passive.tsp * 100; - timeout_jiffies = jiffies + (HZ * sleep_time) / 1000; - } else if (tz->polling_frequency > 0) { - sleep_time = tz->polling_frequency * 100; - timeout_jiffies = round_jiffies(jiffies + (HZ * sleep_time) / 1000); - } - - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "%s: temperature[%lu] sleep[%lu]\n", - tz->name, tz->temperature, sleep_time)); - /* - * Schedule Next Poll - * ------------------ - */ - if (!sleep_time) { - if (timer_pending(&(tz->timer))) - del_timer(&(tz->timer)); - } else { - if (timer_pending(&(tz->timer))) - mod_timer(&(tz->timer), timeout_jiffies); - else { - tz->timer.data = (unsigned long)tz; - tz->timer.function = acpi_thermal_run; - tz->timer.expires = timeout_jiffies; - add_timer(&(tz->timer)); - } - } - unlock: - mutex_unlock(&tz->lock); + thermal_zone_device_update(tz->thermal_zone); } /* sys I/F for generic thermal sysfs support */ @@ -1122,6 +750,29 @@ static int thermal_get_crit_temp(struct thermal_zone_device *thermal, return -EINVAL; } +static int thermal_notify(struct thermal_zone_device *thermal, int trip, + enum thermal_trip_type trip_type) +{ + u8 type = 0; + struct acpi_thermal *tz = thermal->devdata; + + if (trip_type == THERMAL_TRIP_CRITICAL) + type = ACPI_THERMAL_NOTIFY_CRITICAL; + else if (trip_type == THERMAL_TRIP_HOT) + type = ACPI_THERMAL_NOTIFY_HOT; + else + return 0; + + acpi_bus_generate_proc_event(tz->device, type, 1); + acpi_bus_generate_netlink_event(tz->device->pnp.device_class, + tz->device->dev.bus_id, type, 1); + + if (trip_type == THERMAL_TRIP_CRITICAL && nocrt) + return 1; + + return 0; +} + typedef int (*cb)(struct thermal_zone_device *, int, struct thermal_cooling_device *); static int acpi_thermal_cooling_device_cb(struct thermal_zone_device *thermal, @@ -1214,6 +865,7 @@ static struct thermal_zone_device_ops acpi_thermal_zone_ops = { .get_trip_type = thermal_get_trip_type, .get_trip_temp = thermal_get_trip_temp, .get_crit_temp = thermal_get_crit_temp, + .notify = thermal_notify, }; static int acpi_thermal_register_thermal_zone(struct acpi_thermal *tz) @@ -1234,8 +886,21 @@ static int acpi_thermal_register_thermal_zone(struct acpi_thermal *tz) for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++, trips++); - tz->thermal_zone = thermal_zone_device_register("acpitz", - trips, tz, &acpi_thermal_zone_ops); + + if (tz->trips.passive.flags.valid) + tz->thermal_zone = + thermal_zone_device_register("acpitz", trips, tz, + &acpi_thermal_zone_ops, + tz->trips.passive.tc1, + tz->trips.passive.tc2, + tz->trips.passive.tsp*100, + tz->polling_frequency*100); + else + tz->thermal_zone = + thermal_zone_device_register("acpitz", trips, tz, + &acpi_thermal_zone_ops, + 0, 0, 0, + tz->polling_frequency); if (IS_ERR(tz->thermal_zone)) return -ENODEV; @@ -1467,13 +1132,13 @@ static int acpi_thermal_polling_seq_show(struct seq_file *seq, void *offset) if (!tz) goto end; - if (!tz->polling_frequency) { + if (!tz->thermal_zone->polling_delay) { seq_puts(seq, "\n"); goto end; } - seq_printf(seq, "polling frequency: %lu seconds\n", - (tz->polling_frequency / 10)); + seq_printf(seq, "polling frequency: %d seconds\n", + (tz->thermal_zone->polling_delay / 1000)); end: return 0; @@ -1703,12 +1368,6 @@ static int acpi_thermal_add(struct acpi_device *device) if (result) goto unregister_thermal_zone; - init_timer(&tz->timer); - - acpi_thermal_active_off(tz); - - acpi_thermal_check(tz); - status = acpi_install_notify_handler(device->handle, ACPI_DEVICE_NOTIFY, acpi_thermal_notify, tz); @@ -1737,36 +1396,15 @@ static int acpi_thermal_remove(struct acpi_device *device, int type) acpi_status status = AE_OK; struct acpi_thermal *tz = NULL; - if (!device || !acpi_driver_data(device)) return -EINVAL; tz = acpi_driver_data(device); - /* avoid timer adding new defer task */ - tz->zombie = 1; - /* wait for running timer (on other CPUs) finish */ - del_timer_sync(&(tz->timer)); - /* synchronize deferred task */ - acpi_os_wait_events_complete(NULL); - /* deferred task may reinsert timer */ - del_timer_sync(&(tz->timer)); - status = acpi_remove_notify_handler(device->handle, ACPI_DEVICE_NOTIFY, acpi_thermal_notify); - /* Terminate policy */ - if (tz->trips.passive.flags.valid && tz->trips.passive.flags.enabled) { - tz->trips.passive.flags.enabled = 0; - acpi_thermal_passive(tz); - } - if (tz->trips.active[0].flags.valid - && tz->trips.active[0].flags.enabled) { - tz->trips.active[0].flags.enabled = 0; - acpi_thermal_active(tz); - } - acpi_thermal_remove_fs(device); acpi_thermal_unregister_thermal_zone(tz); mutex_destroy(&tz->lock); diff --git a/drivers/thermal/thermal_sys.c b/drivers/thermal/thermal_sys.c index bd139adc6d32..6378741882f3 100644 --- a/drivers/thermal/thermal_sys.c +++ b/drivers/thermal/thermal_sys.c @@ -30,6 +30,7 @@ #include #include #include +#include MODULE_AUTHOR("Zhang Rui"); MODULE_DESCRIPTION("Generic thermal management sysfs support"); @@ -517,6 +518,97 @@ thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz) } #endif +static void thermal_zone_device_set_polling(struct thermal_zone_device *tz, + int delay) +{ + cancel_delayed_work(&(tz->poll_queue)); + + if (!delay) + return; + + if (delay > 1000) + schedule_delayed_work(&(tz->poll_queue), + round_jiffies(msecs_to_jiffies(delay))); + else + schedule_delayed_work(&(tz->poll_queue), + msecs_to_jiffies(delay)); +} + +static void thermal_zone_device_passive(struct thermal_zone_device *tz, + int temp, int trip_temp, int trip) +{ + int trend = 0; + struct thermal_cooling_device_instance *instance; + struct thermal_cooling_device *cdev; + long state, max_state; + + /* + * Above Trip? + * ----------- + * Calculate the thermal trend (using the passive cooling equation) + * and modify the performance limit for all passive cooling devices + * accordingly. Note that we assume symmetry. + */ + if (temp >= trip_temp) { + tz->passive = true; + + trend = (tz->tc1 * (temp - tz->last_temperature)) + + (tz->tc2 * (temp - trip_temp)); + + /* Heating up? */ + if (trend > 0) { + list_for_each_entry(instance, &tz->cooling_devices, + node) { + if (instance->trip != trip) + continue; + cdev = instance->cdev; + cdev->ops->get_cur_state(cdev, &state); + cdev->ops->get_max_state(cdev, &max_state); + if (state++ < max_state) + cdev->ops->set_cur_state(cdev, state); + } + } else if (trend < 0) { /* Cooling off? */ + list_for_each_entry(instance, &tz->cooling_devices, + node) { + if (instance->trip != trip) + continue; + cdev = instance->cdev; + cdev->ops->get_cur_state(cdev, &state); + cdev->ops->get_max_state(cdev, &max_state); + if (state > 0) + cdev->ops->set_cur_state(cdev, --state); + } + } + return; + } + + /* + * Below Trip? + * ----------- + * Implement passive cooling hysteresis to slowly increase performance + * and avoid thrashing around the passive trip point. Note that we + * assume symmetry. + */ + list_for_each_entry(instance, &tz->cooling_devices, node) { + if (instance->trip != trip) + continue; + cdev = instance->cdev; + cdev->ops->get_cur_state(cdev, &state); + cdev->ops->get_max_state(cdev, &max_state); + if (state > 0) + cdev->ops->set_cur_state(cdev, --state); + if (state == 0) + tz->passive = false; + } +} + +static void thermal_zone_device_check(struct work_struct *work) +{ + struct thermal_zone_device *tz = container_of(work, struct + thermal_zone_device, + poll_queue.work); + thermal_zone_device_update(tz); +} /** * thermal_zone_bind_cooling_device - bind a cooling device to a thermal zone @@ -786,21 +878,102 @@ void thermal_cooling_device_unregister(struct EXPORT_SYMBOL(thermal_cooling_device_unregister); +/** + * thermal_zone_device_update - force an update of a thermal zone's state + * @ttz: the thermal zone to update + */ + +void thermal_zone_device_update(struct thermal_zone_device *tz) +{ + int count, ret = 0; + long temp, trip_temp; + enum thermal_trip_type trip_type; + struct thermal_cooling_device_instance *instance; + struct thermal_cooling_device *cdev; + + mutex_lock(&tz->lock); + + tz->ops->get_temp(tz, &temp); + + for (count = 0; count < tz->trips; count++) { + tz->ops->get_trip_type(tz, count, &trip_type); + tz->ops->get_trip_temp(tz, count, &trip_temp); + + switch (trip_type) { + case THERMAL_TRIP_CRITICAL: + if (temp > trip_temp) { + if (tz->ops->notify) + ret = tz->ops->notify(tz, count, + trip_type); + if (!ret) { + printk(KERN_EMERG + "Critical temperature reached (%ld C), shutting down.\n", + temp/1000); + orderly_poweroff(true); + } + } + break; + case THERMAL_TRIP_HOT: + if (temp > trip_temp) + if (tz->ops->notify) + tz->ops->notify(tz, count, trip_type); + break; + case THERMAL_TRIP_ACTIVE: + list_for_each_entry(instance, &tz->cooling_devices, + node) { + if (instance->trip != count) + continue; + + cdev = instance->cdev; + + if (temp > trip_temp) + cdev->ops->set_cur_state(cdev, 1); + else + cdev->ops->set_cur_state(cdev, 0); + } + break; + case THERMAL_TRIP_PASSIVE: + if (temp > trip_temp || tz->passive) + thermal_zone_device_passive(tz, temp, + trip_temp, count); + break; + } + } + tz->last_temperature = temp; + if (tz->passive) + thermal_zone_device_set_polling(tz, tz->passive_delay); + else if (tz->polling_delay) + thermal_zone_device_set_polling(tz, tz->polling_delay); + mutex_unlock(&tz->lock); +} +EXPORT_SYMBOL(thermal_zone_device_update); + /** * thermal_zone_device_register - register a new thermal zone device * @type: the thermal zone device type * @trips: the number of trip points the thermal zone support * @devdata: private device data * @ops: standard thermal zone device callbacks + * @tc1: thermal coefficient 1 for passive calculations + * @tc2: thermal coefficient 2 for passive calculations + * @passive_delay: number of milliseconds to wait between polls when + * performing passive cooling + * @polling_delay: number of milliseconds to wait between polls when checking + * whether trip points have been crossed (0 for interrupt + * driven systems) * * thermal_zone_device_unregister() must be called when the device is no - * longer needed. + * longer needed. The passive cooling formula uses tc1 and tc2 as described in + * section 11.1.5.1 of the ACPI specification 3.0. */ struct thermal_zone_device *thermal_zone_device_register(char *type, int trips, void *devdata, struct thermal_zone_device_ops - *ops) + *ops, int tc1, int + tc2, + int passive_delay, + int polling_delay) { struct thermal_zone_device *tz; struct thermal_cooling_device *pos; @@ -834,6 +1007,11 @@ struct thermal_zone_device *thermal_zone_device_register(char *type, tz->device.class = &thermal_class; tz->devdata = devdata; tz->trips = trips; + tz->tc1 = tc1; + tz->tc2 = tc2; + tz->passive_delay = passive_delay; + tz->polling_delay = polling_delay; + dev_set_name(&tz->device, "thermal_zone%d", tz->id); result = device_register(&tz->device); if (result) { @@ -879,6 +1057,10 @@ struct thermal_zone_device *thermal_zone_device_register(char *type, } mutex_unlock(&thermal_list_lock); + INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check); + + thermal_zone_device_update(tz); + if (!result) return tz; @@ -918,6 +1100,8 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz) tz->ops->unbind(tz, cdev); mutex_unlock(&thermal_list_lock); + thermal_zone_device_set_polling(tz, 0); + if (tz->type[0]) device_remove_file(&tz->device, &dev_attr_type); device_remove_file(&tz->device, &dev_attr_temp); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 4cb3292fb6e4..a81c61521ba4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -27,6 +27,7 @@ #include #include +#include struct thermal_zone_device; struct thermal_cooling_device; @@ -58,6 +59,8 @@ struct thermal_zone_device_ops { int (*get_trip_temp) (struct thermal_zone_device *, int, unsigned long *); int (*get_crit_temp) (struct thermal_zone_device *, unsigned long *); + int (*notify) (struct thermal_zone_device *, int, + enum thermal_trip_type); }; struct thermal_cooling_device_ops { @@ -104,11 +107,18 @@ struct thermal_zone_device { struct device device; void *devdata; int trips; + int tc1; + int tc2; + int passive_delay; + int polling_delay; + int last_temperature; + bool passive; struct thermal_zone_device_ops *ops; struct list_head cooling_devices; struct idr idr; struct mutex lock; /* protect cooling devices list */ struct list_head node; + struct delayed_work poll_queue; #if defined(CONFIG_THERMAL_HWMON) struct list_head hwmon_node; struct thermal_hwmon_device *hwmon; @@ -120,13 +130,16 @@ struct thermal_zone_device { struct thermal_zone_device *thermal_zone_device_register(char *, int, void *, struct thermal_zone_device_ops - *); + *, int tc1, int tc2, + int passive_freq, + int polling_freq); void thermal_zone_device_unregister(struct thermal_zone_device *); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); +void thermal_zone_device_update(struct thermal_zone_device *); struct thermal_cooling_device *thermal_cooling_device_register(char *, void *, struct thermal_cooling_device_ops -- cgit v1.2.3-59-g8ed1b From 5e01cb695d29619dd551bac7d6aa4ef1dc8ebc95 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 24 Feb 2009 13:55:18 +0100 Subject: x86, ftrace: fix section mismatch in hw-branch-tracer Fix an invalid memory reference problem when cpu hotplug support is disabled and the hw-branch-tracer is set as current tracer. Initializing the tracer calls bts_trace_init() which has already been freed at this time. Reported-by: Frederic Weisbecker Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 3561aace075c..3335e807144b 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -127,20 +127,18 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { .notifier_call = bts_hotcpu_handler }; -static int __cpuinit bts_trace_init(struct trace_array *tr) +static int bts_trace_init(struct trace_array *tr) { hw_branch_trace = tr; - register_hotcpu_notifier(&bts_hotcpu_notifier); bts_trace_start(tr); return 0; } -static void __cpuinit bts_trace_reset(struct trace_array *tr) +static void bts_trace_reset(struct trace_array *tr) { bts_trace_stop(tr); - unregister_hotcpu_notifier(&bts_hotcpu_notifier); } static void bts_trace_print_header(struct seq_file *m) @@ -299,6 +297,7 @@ struct tracer bts_tracer __read_mostly = __init static int init_bts_trace(void) { + register_hotcpu_notifier(&bts_hotcpu_notifier); return register_tracer(&bts_tracer); } device_initcall(init_bts_trace); -- cgit v1.2.3-59-g8ed1b From 499aa86dcbc3c4daf7d2c59c5c30e1a78220fbc1 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 24 Feb 2009 14:12:34 +0100 Subject: x86, ptrace: remove CONFIG guards around declarations Remove unnecessary CONFIG guards around type declarations and macro definitions. Reported-by: Cyrill Gorcunov Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace-abi.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h index 8e0f8d199e05..86723035a515 100644 --- a/arch/x86/include/asm/ptrace-abi.h +++ b/arch/x86/include/asm/ptrace-abi.h @@ -80,8 +80,6 @@ #define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */ -#ifdef CONFIG_X86_PTRACE_BTS - #ifndef __ASSEMBLY__ #include @@ -140,6 +138,5 @@ struct ptrace_bts_config { BTS records are read from oldest to newest. Returns number of BTS records drained. */ -#endif /* CONFIG_X86_PTRACE_BTS */ #endif /* _ASM_X86_PTRACE_ABI_H */ -- cgit v1.2.3-59-g8ed1b From 7c37730cd31ddb2d3a1da142af9b18c29b8c433b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Feb 2009 12:07:53 -0500 Subject: tracing: add DEFINE_TRACE_FMT to tracepoint.h This patch creates a DEFINE_TRACE_FMT to map to DECLARE_TRACE. This allows for the developers to place format strings and args in with their tracepoint declaration. A tracer may now override the DEFINE_TRACE_FMT macro and use it to record a default format. Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 757005458366..34ae464effff 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,4 +153,7 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_sched(); } +#define DEFINE_TRACE_FMT(name, proto, args, fmt) \ + DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args)) + #endif -- cgit v1.2.3-59-g8ed1b From b77e38aa240c3bd9c55c98b9f7c81541e042eae5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Feb 2009 10:21:36 -0500 Subject: tracing: add event trace infrastructure This patch creates the event tracing infrastructure of ftrace. It will create the files: /debug/tracing/available_events /debug/tracing/set_event The available_events will list the trace points that have been registered with the event tracer. set_events will allow the user to enable or disable an event hook. example: # echo sched_wakeup > /debug/tracing/set_event Will enable the sched_wakeup event (if it is registered). # echo "!sched_wakeup" >> /debug/tracing/set_event Will disable the sched_wakeup event (and only that event). # echo > /debug/tracing/set_event Will disable all events (notice the '>') # cat /debug/tracing/available_events > /debug/tracing/set_event Will enable all registered event hooks. Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 11 +- kernel/trace/Kconfig | 9 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_events.c | 280 ++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.h | 52 +++++++ 5 files changed, 352 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/trace_events.c create mode 100644 kernel/trace/trace_events.h diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c61fab1dd2f8..0add6b28c366 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -61,6 +61,14 @@ #define BRANCH_PROFILE() #endif +#ifdef CONFIG_EVENT_TRACER +#define FTRACE_EVENTS() VMLINUX_SYMBOL(__start_ftrace_events) = .; \ + *(_ftrace_events) \ + VMLINUX_SYMBOL(__stop_ftrace_events) = .; +#else +#define FTRACE_EVENTS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -81,7 +89,8 @@ *(__tracepoints) \ VMLINUX_SYMBOL(__stop___tracepoints) = .; \ LIKELY_PROFILE() \ - BRANCH_PROFILE() + BRANCH_PROFILE() \ + FTRACE_EVENTS() #define RO_DATA(align) \ . = ALIGN((align)); \ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 07877f4b5233..999c6a2485df 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -159,6 +159,15 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. +config EVENT_TRACER + bool "Trace various events in the kernel" + depends on DEBUG_KERNEL + select TRACING + help + This tracer hooks to various trace points in the kernel + allowing the user to pick and choose which trace point they + want to trace. + config BOOT_TRACER bool "Trace boot initcalls" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 627090bc262d..c7363568b1cf 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -38,5 +38,6 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +obj-$(CONFIG_EVENT_TRACER) += trace_events.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c new file mode 100644 index 000000000000..05bc80ec8d2c --- /dev/null +++ b/kernel/trace/trace_events.c @@ -0,0 +1,280 @@ +/* + * event tracer + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt + * + */ + +#include +#include +#include +#include + +#include "trace_events.h" + +void event_trace_printk(unsigned long ip, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + tracing_record_cmdline(current); + trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); +} + +static void ftrace_clear_events(void) +{ + struct ftrace_event_call *call = (void *)__start_ftrace_events; + + + while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + call++; + } +} + +static int ftrace_set_clr_event(char *buf, int set) +{ + struct ftrace_event_call *call = (void *)__start_ftrace_events; + + + while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + + if (strcmp(buf, call->name) != 0) { + call++; + continue; + } + + if (set) { + /* Already set? */ + if (call->enabled) + return 0; + call->enabled = 1; + call->regfunc(); + } else { + /* Already cleared? */ + if (!call->enabled) + return 0; + call->enabled = 0; + call->unregfunc(); + } + return 0; + } + return -EINVAL; +} + +/* 128 should be much more than enough */ +#define EVENT_BUF_SIZE 127 + +static ssize_t +ftrace_event_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + size_t read = 0; + int i, set = 1; + ssize_t ret; + char *buf; + char ch; + + if (!cnt || cnt < 0) + return 0; + + ret = get_user(ch, ubuf++); + if (ret) + return ret; + read++; + cnt--; + + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + return ret; + read++; + cnt--; + } + + /* Only white space found? */ + if (isspace(ch)) { + file->f_pos += read; + ret = read; + return ret; + } + + buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (cnt > EVENT_BUF_SIZE) + cnt = EVENT_BUF_SIZE; + + i = 0; + while (cnt && !isspace(ch)) { + if (!i && ch == '!') + set = 0; + else + buf[i++] = ch; + + ret = get_user(ch, ubuf++); + if (ret) + goto out_free; + read++; + cnt--; + } + buf[i] = 0; + + file->f_pos += read; + + ret = ftrace_set_clr_event(buf, set); + if (ret) + goto out_free; + + ret = read; + + out_free: + kfree(buf); + + return ret; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_call *next = call; + + (*pos)++; + + if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + return NULL; + + m->private = ++next; + + return call; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + return t_next(m, NULL, pos); +} + +static void * +s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_call *next; + + (*pos)++; + + retry: + if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + return NULL; + + if (!call->enabled) { + call++; + goto retry; + } + + next = call; + m->private = ++next; + + return call; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + return s_next(m, NULL, pos); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_event_call *call = v; + + seq_printf(m, "%s\n", call->name); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int +ftrace_event_seq_open(struct inode *inode, struct file *file) +{ + int ret; + const struct seq_operations *seq_ops; + + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + ftrace_clear_events(); + + seq_ops = inode->i_private; + ret = seq_open(file, seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = __start_ftrace_events; + } + return ret; +} + +static const struct seq_operations show_event_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct seq_operations show_set_event_seq_ops = { + .start = s_start, + .next = s_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_event_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_set_event_fops = { + .open = ftrace_event_seq_open, + .read = seq_read, + .write = ftrace_event_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int event_trace_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("available_events", 0444, d_tracer, + (void *)&show_event_seq_ops, + &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_events' entry\n"); + + entry = debugfs_create_file("set_event", 0644, d_tracer, + (void *)&show_set_event_seq_ops, + &ftrace_set_event_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_event' entry\n"); + + return 0; +} +fs_initcall(event_trace_init); diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h new file mode 100644 index 000000000000..39342f86db27 --- /dev/null +++ b/kernel/trace/trace_events.h @@ -0,0 +1,52 @@ +#ifndef _LINUX_KERNEL_TRACE_EVENTS_H +#define _LINUX_KERNEL_TRACE_EVENTS_H + +#include +#include "trace.h" + +struct ftrace_event_call { + char *name; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); +}; + + +#undef TPFMT +#define TPFMT(fmt, args...) fmt "\n", ##args + +#undef DEFINE_TRACE_FMT +#define DEFINE_TRACE_FMT(call, proto, args, fmt) \ +static void ftrace_event_##call(proto) \ +{ \ + event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ +} \ + \ +static int ftrace_reg_event_##call(void) \ +{ \ + int ret; \ + \ + ret = register_trace_##call(ftrace_event_##call); \ + if (!ret) \ + pr_info("event trace: Could not activate trace point " \ + "probe to " #call); \ + return ret; \ +} \ + \ +static void ftrace_unreg_event_##call(void) \ +{ \ + unregister_trace_##call(ftrace_event_##call); \ +} \ + \ +static struct ftrace_event_call __used \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .regfunc = ftrace_reg_event_##call, \ + .unregfunc = ftrace_unreg_event_##call, \ +} + +void event_trace_printk(unsigned long ip, const char *fmt, ...); +extern unsigned long __start_ftrace_events[]; +extern unsigned long __stop_ftrace_events[]; + +#endif /* _LINUX_KERNEL_TRACE_EVENTS_H */ -- cgit v1.2.3-59-g8ed1b From f3fe8e4a38fd19dbb3f8ffb1826aa840ae304a65 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Feb 2009 10:22:57 -0500 Subject: tracing: add schedule events to event trace This patch changes the trace/sched.h to use the DECLARE_TRACE_FMT such that they are automatically registered with the event tracer. And it also adds the tracing sched headers to kernel/trace/events.c Signed-off-by: Steven Rostedt --- include/trace/sched.h | 49 +------------------------- include/trace/sched_event_types.h | 72 +++++++++++++++++++++++++++++++++++++++ kernel/trace/Makefile | 1 + kernel/trace/events.c | 13 +++++++ 4 files changed, 87 insertions(+), 48 deletions(-) create mode 100644 include/trace/sched_event_types.h create mode 100644 kernel/trace/events.c diff --git a/include/trace/sched.h b/include/trace/sched.h index 0d81098ee9fc..4e372a1a29bf 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -4,53 +4,6 @@ #include #include -DECLARE_TRACE(sched_kthread_stop, - TPPROTO(struct task_struct *t), - TPARGS(t)); - -DECLARE_TRACE(sched_kthread_stop_ret, - TPPROTO(int ret), - TPARGS(ret)); - -DECLARE_TRACE(sched_wait_task, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p)); - -DECLARE_TRACE(sched_wakeup, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success)); - -DECLARE_TRACE(sched_wakeup_new, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success)); - -DECLARE_TRACE(sched_switch, - TPPROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), - TPARGS(rq, prev, next)); - -DECLARE_TRACE(sched_migrate_task, - TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - TPARGS(p, orig_cpu, dest_cpu)); - -DECLARE_TRACE(sched_process_free, - TPPROTO(struct task_struct *p), - TPARGS(p)); - -DECLARE_TRACE(sched_process_exit, - TPPROTO(struct task_struct *p), - TPARGS(p)); - -DECLARE_TRACE(sched_process_wait, - TPPROTO(struct pid *pid), - TPARGS(pid)); - -DECLARE_TRACE(sched_process_fork, - TPPROTO(struct task_struct *parent, struct task_struct *child), - TPARGS(parent, child)); - -DECLARE_TRACE(sched_signal_send, - TPPROTO(int sig, struct task_struct *p), - TPARGS(sig, p)); +#include #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h new file mode 100644 index 000000000000..a4f662940f4e --- /dev/null +++ b/include/trace/sched_event_types.h @@ -0,0 +1,72 @@ + +/* use instead */ +#ifndef DEFINE_TRACE_FMT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +DEFINE_TRACE_FMT(sched_kthread_stop, + TPPROTO(struct task_struct *t), + TPARGS(t), + TPFMT("task %s:%d", t->comm, t->pid)); + +DEFINE_TRACE_FMT(sched_kthread_stop_ret, + TPPROTO(int ret), + TPARGS(ret), + TPFMT("ret=%d", ret)); + +DEFINE_TRACE_FMT(sched_wait_task, + TPPROTO(struct rq *rq, struct task_struct *p), + TPARGS(rq, p), + TPFMT("task %s:%d", p->comm, p->pid)); + +DEFINE_TRACE_FMT(sched_wakeup, + TPPROTO(struct rq *rq, struct task_struct *p, int success), + TPARGS(rq, p, success), + TPFMT("task %s:%d %s", + p->comm, p->pid, success?"succeeded":"failed")); + +DEFINE_TRACE_FMT(sched_wakeup_new, + TPPROTO(struct rq *rq, struct task_struct *p, int success), + TPARGS(rq, p, success), + TPFMT("task %s:%d", + p->comm, p->pid, success?"succeeded":"failed")); + +DEFINE_TRACE_FMT(sched_switch, + TPPROTO(struct rq *rq, struct task_struct *prev, + struct task_struct *next), + TPARGS(rq, prev, next), + TPFMT("task %s:%d ==> %s:%d", + prev->comm, prev->pid, next->comm, next->pid)); + +DEFINE_TRACE_FMT(sched_migrate_task, + TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TPARGS(p, orig_cpu, dest_cpu), + TPFMT("task %s:%d from: %d to: %d", + p->comm, p->pid, orig_cpu, dest_cpu)); + +DEFINE_TRACE_FMT(sched_process_free, + TPPROTO(struct task_struct *p), + TPARGS(p), + TPFMT("task %s:%d", p->comm, p->pid)); + +DEFINE_TRACE_FMT(sched_process_exit, + TPPROTO(struct task_struct *p), + TPARGS(p), + TPFMT("task %s:%d", p->comm, p->pid)); + +DEFINE_TRACE_FMT(sched_process_wait, + TPPROTO(struct pid *pid), + TPARGS(pid), + TPFMT("pid %d", pid)); + +DEFINE_TRACE_FMT(sched_process_fork, + TPPROTO(struct task_struct *parent, struct task_struct *child), + TPARGS(parent, child), + TPFMT("parent %s:%d child %s:%d", + parent->comm, parent->pid, child->comm, child->pid)); + +DEFINE_TRACE_FMT(sched_signal_send, + TPPROTO(int sig, struct task_struct *p), + TPARGS(sig, p), + TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid)); diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c7363568b1cf..664b6c0dc75a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -39,5 +39,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACER) += trace_events.o +obj-$(CONFIG_EVENT_TRACER) += events.o libftrace-y := ftrace.o diff --git a/kernel/trace/events.c b/kernel/trace/events.c new file mode 100644 index 000000000000..38c89eef99ee --- /dev/null +++ b/kernel/trace/events.c @@ -0,0 +1,13 @@ +/* + * This is the place to register all trace points as events. + * Include the trace/.h at the top. + * Include the trace/_event_types.h at the bottom. + */ + +/* trace/.h here */ +#include + +#include "trace_events.h" + +/* trace/_event_types.h here */ +#include -- cgit v1.2.3-59-g8ed1b From 1473e4417c79f12d91ef91a469699bfa911f510f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Feb 2009 14:15:08 -0500 Subject: tracing: make event directory structure This patch adds the directory /debug/tracing/events/ that will contain all the registered trace points. # ls /debug/tracing/events/ sched_kthread_stop sched_process_fork sched_switch sched_kthread_stop_ret sched_process_free sched_wait_task sched_migrate_task sched_process_wait sched_wakeup sched_process_exit sched_signal_send sched_wakeup_new # ls /debug/tracing/events/sched_switch/ enable # cat /debug/tracing/events/sched_switch/enable 1 # cat /debug/tracing/set_event sched_switch Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 137 ++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace_events.h | 7 ++- 2 files changed, 137 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 05bc80ec8d2c..3bcb9df93342 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -12,6 +12,11 @@ #include "trace_events.h" +#define events_for_each(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + void event_trace_printk(unsigned long ip, const char *fmt, ...) { va_list ap; @@ -39,15 +44,16 @@ static void ftrace_clear_events(void) static int ftrace_set_clr_event(char *buf, int set) { - struct ftrace_event_call *call = (void *)__start_ftrace_events; + struct ftrace_event_call *call = __start_ftrace_events; - while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + events_for_each(call) { - if (strcmp(buf, call->name) != 0) { - call++; + if (!call->name) + continue; + + if (strcmp(buf, call->name) != 0) continue; - } if (set) { /* Already set? */ @@ -223,6 +229,67 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) return ret; } +static ssize_t +event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char *buf; + + if (call->enabled) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[64]; + unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + if (!call->enabled) + break; + + call->enabled = 0; + call->unregfunc(); + break; + case 1: + if (call->enabled) + break; + + call->enabled = 1; + call->regfunc(); + break; + + default: + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -252,10 +319,59 @@ static const struct file_operations ftrace_set_event_fops = { .release = seq_release, }; +static const struct file_operations ftrace_enable_fops = { + .open = tracing_open_generic, + .read = event_enable_read, + .write = event_enable_write, +}; + +static struct dentry *event_trace_events_dir(void) +{ + static struct dentry *d_tracer; + static struct dentry *d_events; + + if (d_events) + return d_events; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return NULL; + + d_events = debugfs_create_dir("events", d_tracer); + if (!d_events) + pr_warning("Could not create debugfs " + "'events' directory\n"); + + return d_events; +} + +static int +event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) +{ + struct dentry *entry; + + call->dir = debugfs_create_dir(call->name, d_events); + if (!call->dir) { + pr_warning("Could not create debugfs " + "'%s' directory\n", call->name); + return -1; + } + + entry = debugfs_create_file("enable", 0644, call->dir, call, + &ftrace_enable_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/enable' entry\n", call->name); + + return 0; +} + static __init int event_trace_init(void) { + struct ftrace_event_call *call = __start_ftrace_events; struct dentry *d_tracer; struct dentry *entry; + struct dentry *d_events; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -275,6 +391,17 @@ static __init int event_trace_init(void) pr_warning("Could not create debugfs " "'set_event' entry\n"); + d_events = event_trace_events_dir(); + if (!d_events) + return 0; + + events_for_each(call) { + /* The linker may leave blanks */ + if (!call->name) + continue; + event_create_dir(call, d_events); + } + return 0; } fs_initcall(event_trace_init); diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h index 39342f86db27..cb8455b3ac9a 100644 --- a/kernel/trace/trace_events.h +++ b/kernel/trace/trace_events.h @@ -1,11 +1,13 @@ #ifndef _LINUX_KERNEL_TRACE_EVENTS_H #define _LINUX_KERNEL_TRACE_EVENTS_H +#include #include #include "trace.h" struct ftrace_event_call { char *name; + struct dentry *dir; int enabled; int (*regfunc)(void); void (*unregfunc)(void); @@ -39,6 +41,7 @@ static void ftrace_unreg_event_##call(void) \ } \ \ static struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .regfunc = ftrace_reg_event_##call, \ @@ -46,7 +49,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ } void event_trace_printk(unsigned long ip, const char *fmt, ...); -extern unsigned long __start_ftrace_events[]; -extern unsigned long __stop_ftrace_events[]; +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; #endif /* _LINUX_KERNEL_TRACE_EVENTS_H */ -- cgit v1.2.3-59-g8ed1b From 2d542cf34264ac92e9e7ac55c0b096b066d569d2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 08:40:09 +0100 Subject: tracing/hw-branch-tracing: convert bts-tracer mutex to a spinlock Impact: fix CPU hotplug lockup bts_hotcpu_handler() is called with irqs disabled, so using mutex_lock() is a no-no. All the BTS codepaths here are atomic (they do not schedule), so using a spinlock is the right solution. Cc: Markus Metzger Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 57 ++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 3335e807144b..7bfdf4c2347f 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -3,17 +3,15 @@ * * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger , 2008-2009 - * */ - -#include -#include +#include +#include #include #include -#include -#include +#include #include #include +#include #include @@ -23,16 +21,17 @@ #define SIZEOF_BTS (1 << 13) -/* The tracer mutex protects the below per-cpu tracer array. - It needs to be held to: - - start tracing on all cpus - - stop tracing on all cpus - - start tracing on a single hotplug cpu - - stop tracing on a single hotplug cpu - - read the trace from all cpus - - read the trace from a single cpu -*/ -static DEFINE_MUTEX(bts_tracer_mutex); +/* + * The tracer lock protects the below per-cpu tracer array. + * It needs to be held to: + * - start tracing on all cpus + * - stop tracing on all cpus + * - start tracing on a single hotplug cpu + * - stop tracing on a single hotplug cpu + * - read the trace from all cpus + * - read the trace from a single cpu + */ +static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); @@ -47,7 +46,7 @@ static struct trace_array *hw_branch_trace __read_mostly; * Start tracing on the current cpu. * The argument is ignored. * - * pre: bts_tracer_mutex must be locked. + * pre: bts_tracer_lock must be locked. */ static void bts_trace_start_cpu(void *arg) { @@ -66,19 +65,19 @@ static void bts_trace_start_cpu(void *arg) static void bts_trace_start(struct trace_array *tr) { - mutex_lock(&bts_tracer_mutex); + spin_lock(&bts_tracer_lock); on_each_cpu(bts_trace_start_cpu, NULL, 1); trace_hw_branches_enabled = 1; - mutex_unlock(&bts_tracer_mutex); + spin_unlock(&bts_tracer_lock); } /* * Stop tracing on the current cpu. * The argument is ignored. * - * pre: bts_tracer_mutex must be locked. + * pre: bts_tracer_lock must be locked. */ static void bts_trace_stop_cpu(void *arg) { @@ -90,12 +89,12 @@ static void bts_trace_stop_cpu(void *arg) static void bts_trace_stop(struct trace_array *tr) { - mutex_lock(&bts_tracer_mutex); + spin_lock(&bts_tracer_lock); trace_hw_branches_enabled = 0; on_each_cpu(bts_trace_stop_cpu, NULL, 1); - mutex_unlock(&bts_tracer_mutex); + spin_unlock(&bts_tracer_lock); } static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, @@ -103,7 +102,7 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, { unsigned int cpu = (unsigned long)hcpu; - mutex_lock(&bts_tracer_mutex); + spin_lock(&bts_tracer_lock); if (!trace_hw_branches_enabled) goto out; @@ -119,7 +118,7 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, } out: - mutex_unlock(&bts_tracer_mutex); + spin_unlock(&bts_tracer_lock); return NOTIFY_DONE; } @@ -225,7 +224,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) /* * Collect the trace on the current cpu and write it into the ftrace buffer. * - * pre: bts_tracer_mutex must be locked + * pre: bts_tracer_lock must be locked */ static void trace_bts_cpu(void *arg) { @@ -261,11 +260,11 @@ out: static void trace_bts_prepare(struct trace_iterator *iter) { - mutex_lock(&bts_tracer_mutex); + spin_lock(&bts_tracer_lock); on_each_cpu(trace_bts_cpu, iter->tr, 1); - mutex_unlock(&bts_tracer_mutex); + spin_unlock(&bts_tracer_lock); } static void trace_bts_close(struct trace_iterator *iter) @@ -275,11 +274,11 @@ static void trace_bts_close(struct trace_iterator *iter) void trace_hw_branch_oops(void) { - mutex_lock(&bts_tracer_mutex); + spin_lock(&bts_tracer_lock); trace_bts_cpu(hw_branch_trace); - mutex_unlock(&bts_tracer_mutex); + spin_unlock(&bts_tracer_lock); } struct tracer bts_tracer __read_mostly = -- cgit v1.2.3-59-g8ed1b From 886b5b73d71e4027d7dc6c14f5f7ab102201ea6b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 11:03:44 +0100 Subject: tracing: remove /debug/tracing/latency_trace Impact: remove old debug/tracing API /debug/tracing/latency_trace is an old legacy format we kept from the old latency tracer. Remove the file for now. If there's any useful bit missing then we'll propagate any useful output bits into the /debug/tracing/trace output. Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e1f3b99a2e52..11ba100f9a9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2938,11 +2938,6 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); - entry = debugfs_create_file("latency_trace", 0444, d_tracer, - &global_trace, &tracing_lt_fops); - if (!entry) - pr_warning("Could not create debugfs 'latency_trace' entry\n"); - entry = debugfs_create_file("trace", 0444, d_tracer, &global_trace, &tracing_fops); if (!entry) -- cgit v1.2.3-59-g8ed1b From 15d0d3b3371227f846b9f644547fde081c7e1c0c Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 25 Feb 2009 06:22:45 +0100 Subject: generic IPI: simplify barriers and locking Simplify the barriers in generic remote function call interrupt code. Firstly, just unconditionally take the lock and check the list in the generic_call_function_single_interrupt IPI handler. As we've just taken an IPI here, the chances are fairly high that there will be work on the list for us, so do the locking unconditionally. This removes the tricky lockless list_empty check and dubious barriers. The change looks bigger than it is because it is just removing an outer loop. Secondly, clarify architecture specific IPI locking rules. Generic code has no tools to impose any sane ordering on IPIs if they go outside normal cache coherency, ergo the arch code must make them appear to obey cache coherency as a "memory operation" to initiate an IPI, and a "memory operation" to receive one. This way at least they can be reasoned about in generic code, and smp_mb used to provide ordering. The combination of these two changes means that explict barriers can be taken out of queue handling for the single case -- shared data is explicitly locked, and ipi ordering must conform to that, so no barriers needed. An extra barrier is needed in the many handler, so as to ensure we load the list element after the IPI is received. Does any architecture actually *need* these barriers? For the initiator I could see it, but for the handler I would be surprised. So the other thing we could do for simplicity is just to require that, rather than just matching with cache coherency, we just require a full barrier before generating an IPI, and after receiving an IPI. In which case, the smp_mb()s can go away. But just for now, we'll be on the safe side and use the barriers (they're in the slow case anyway). Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: Andrew Morton Cc: Linus Torvalds Cc: Jens Axboe Cc: Oleg Nesterov Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- kernel/smp.c | 83 ++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index bbedbb7efe32..6ecf4b9895d4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -74,9 +74,16 @@ static void generic_exec_single(int cpu, struct call_single_data *data) spin_unlock_irqrestore(&dst->lock, flags); /* - * Make the list addition visible before sending the ipi. + * The list addition should be visible before sending the IPI + * handler locks the list to pull the entry off it because of + * normal cache coherency rules implied by spinlocks. + * + * If IPIs can go out of order to the cache coherency protocol + * in an architecture, sufficient synchronisation should be added + * to arch code to make it appear to obey cache coherency WRT + * locking and barrier primitives. Generic code isn't really equipped + * to do the right thing... */ - smp_mb(); if (ipi) arch_send_call_function_single_ipi(cpu); @@ -103,6 +110,14 @@ void generic_smp_call_function_interrupt(void) struct call_function_data *data; int cpu = get_cpu(); + /* + * Ensure entry is visible on call_function_queue after we have + * entered the IPI. See comment in smp_call_function_many. + * If we don't have this, then we may miss an entry on the list + * and never get another IPI to process it. + */ + smp_mb(); + /* * It's ok to use list_for_each_rcu() here even though we may delete * 'pos', since list_del_rcu() doesn't clear ->next @@ -154,49 +169,37 @@ void generic_smp_call_function_single_interrupt(void) { struct call_single_queue *q = &__get_cpu_var(call_single_queue); LIST_HEAD(list); + unsigned int data_flags; - /* - * Need to see other stores to list head for checking whether - * list is empty without holding q->lock - */ - smp_read_barrier_depends(); - while (!list_empty(&q->list)) { - unsigned int data_flags; - - spin_lock(&q->lock); - list_replace_init(&q->list, &list); - spin_unlock(&q->lock); + spin_lock(&q->lock); + list_replace_init(&q->list, &list); + spin_unlock(&q->lock); - while (!list_empty(&list)) { - struct call_single_data *data; + while (!list_empty(&list)) { + struct call_single_data *data; - data = list_entry(list.next, struct call_single_data, - list); - list_del(&data->list); + data = list_entry(list.next, struct call_single_data, + list); + list_del(&data->list); - /* - * 'data' can be invalid after this call if - * flags == 0 (when called through - * generic_exec_single(), so save them away before - * making the call. - */ - data_flags = data->flags; - - data->func(data->info); - - if (data_flags & CSD_FLAG_WAIT) { - smp_wmb(); - data->flags &= ~CSD_FLAG_WAIT; - } else if (data_flags & CSD_FLAG_LOCK) { - smp_wmb(); - data->flags &= ~CSD_FLAG_LOCK; - } else if (data_flags & CSD_FLAG_ALLOC) - kfree(data); - } /* - * See comment on outer loop + * 'data' can be invalid after this call if + * flags == 0 (when called through + * generic_exec_single(), so save them away before + * making the call. */ - smp_read_barrier_depends(); + data_flags = data->flags; + + data->func(data->info); + + if (data_flags & CSD_FLAG_WAIT) { + smp_wmb(); + data->flags &= ~CSD_FLAG_WAIT; + } else if (data_flags & CSD_FLAG_LOCK) { + smp_wmb(); + data->flags &= ~CSD_FLAG_LOCK; + } else if (data_flags & CSD_FLAG_ALLOC) + kfree(data); } } @@ -375,6 +378,8 @@ void smp_call_function_many(const struct cpumask *mask, /* * Make the list addition visible before sending the ipi. + * (IPIs must obey or appear to obey normal Linux cache coherency + * rules -- see comment in generic_exec_single). */ smp_mb(); -- cgit v1.2.3-59-g8ed1b From b04cc6b1f6398b0e0b60d37e27ce51b4899672ec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 25 Feb 2009 03:22:28 +0100 Subject: tracing/core: introduce per cpu tracing files Impact: split up tracing output per cpu Currently, on the tracing debugfs directory, three files are available to the user to let him extracting the trace output: - trace is an iterator through the ring-buffer. It's a reader but not a consumer It doesn't block when no more traces are available. - trace pretty similar to the former, except that it adds more informations such as prempt count, irq flag, ... - trace_pipe is a reader and a consumer, it will also block waiting for traces if necessary (heh, yes it's a pipe). The traces coming from different cpus are curretly mixed up inside these files. Sometimes it messes up the informations, sometimes it's useful, depending on what does the tracer capture. The tracing_cpumask file is useful to filter the output and select only the traces captured a custom defined set of cpus. But still it is not enough powerful to extract at the same time one trace buffer per cpu. So this patch creates a new directory: /debug/tracing/per_cpu/. Inside this directory, you will now find one trace_pipe file and one trace file per cpu. Which means if you have two cpus, you will have: trace0 trace1 trace_pipe0 trace_pipe1 And of course, reading these files will have the same effect than with the usual tracing files, except that you will only see the traces from the given cpu. The original all-in-one cpu trace file are still available on their original place. Until now, only one consumer was allowed on trace_pipe to avoid racy consuming on the ring-buffer. Now the approach changed a bit, you can have only one consumer per cpu. Which means you are allowed to read concurrently trace_pipe0 and trace_pipe1 But you can't have two readers on trace_pipe0 or trace_pipe1. Following the same logic, if there is one reader on the common trace_pipe, you can not have at the same time another reader on trace_pipe0 or in trace_pipe1. Because in trace_pipe is already a consumer in all cpu buffers in essence. Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 168 +++++++++++++++++++++++++++++++++++++++++++-------- kernel/trace/trace.h | 3 + 2 files changed, 147 insertions(+), 24 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 11ba100f9a9e..aa58b7bc847c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -98,6 +98,9 @@ static inline void ftrace_enable_cpu(void) static cpumask_var_t __read_mostly tracing_buffer_mask; +/* Define which cpu buffers are currently read in trace_pipe */ +static cpumask_var_t tracing_reader_cpumask; + #define for_each_tracing_cpu(cpu) \ for_each_cpu(cpu, tracing_buffer_mask) @@ -1195,10 +1198,25 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; + int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; int cpu; + /* + * If we are in a per_cpu trace file, don't bother by iterating over + * all cpu and peek directly. + */ + if (cpu_file > TRACE_PIPE_ALL_CPU) { + if (ring_buffer_empty_cpu(buffer, cpu_file)) + return NULL; + ent = peek_next_entry(iter, cpu_file, ent_ts); + if (ent_cpu) + *ent_cpu = cpu_file; + + return ent; + } + for_each_tracing_cpu(cpu) { if (ring_buffer_empty_cpu(buffer, cpu)) @@ -1279,6 +1297,7 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; + int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; @@ -1299,9 +1318,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) ftrace_disable_cpu(); - for_each_tracing_cpu(cpu) { - ring_buffer_iter_reset(iter->buffer_iter[cpu]); - } + if (cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) + ring_buffer_iter_reset(iter->buffer_iter[cpu]); + } else + ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); + ftrace_enable_cpu(); @@ -1653,6 +1675,7 @@ static struct seq_operations tracer_seq_ops = { static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, int *ret) { + long cpu_file = (long) inode->i_private; struct trace_iterator *iter; struct seq_file *m; int cpu; @@ -1672,9 +1695,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) if (current_trace && current_trace->print_max) iter->tr = &max_tr; else - iter->tr = inode->i_private; + iter->tr = &global_trace; iter->trace = current_trace; iter->pos = -1; + iter->cpu_file = cpu_file; /* Notify the tracer early; before we stop tracing. */ if (iter->trace && iter->trace->open) @@ -1684,14 +1708,22 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) if (ring_buffer_overruns(iter->tr->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; + if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) { - for_each_tracing_cpu(cpu) { + iter->buffer_iter[cpu] = + ring_buffer_read_start(iter->tr->buffer, cpu); + if (!iter->buffer_iter[cpu]) + goto fail_buffer; + } + } else { + cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_start(iter->tr->buffer, cpu); + ring_buffer_read_start(iter->tr->buffer, cpu); if (!iter->buffer_iter[cpu]) - goto fail_buffer; + goto fail; } /* TODO stop tracer */ @@ -1715,6 +1747,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); } +fail: mutex_unlock(&trace_types_lock); kfree(iter); @@ -2325,54 +2358,77 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return cnt; } -static atomic_t tracing_reader; - static int tracing_open_pipe(struct inode *inode, struct file *filp) { + long cpu_file = (long) inode->i_private; struct trace_iterator *iter; + int ret = 0; if (tracing_disabled) return -ENODEV; - /* We only allow for reader of the pipe */ - if (atomic_inc_return(&tracing_reader) != 1) { - atomic_dec(&tracing_reader); - return -EBUSY; + mutex_lock(&trace_types_lock); + + /* We only allow one reader per cpu */ + if (cpu_file == TRACE_PIPE_ALL_CPU) { + if (!cpumask_empty(tracing_reader_cpumask)) { + ret = -EBUSY; + goto out; + } + cpumask_setall(tracing_reader_cpumask); + } else { + if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) + cpumask_set_cpu(cpu_file, tracing_reader_cpumask); + else { + ret = -EBUSY; + goto out; + } } /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; + if (!iter) { + ret = -ENOMEM; + goto out; + } if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { kfree(iter); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - mutex_lock(&trace_types_lock); - /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); + iter->cpu_file = cpu_file; iter->tr = &global_trace; iter->trace = current_trace; filp->private_data = iter; if (iter->trace->pipe_open) iter->trace->pipe_open(iter); - mutex_unlock(&trace_types_lock); - return 0; +out: + mutex_unlock(&trace_types_lock); + return ret; } static int tracing_release_pipe(struct inode *inode, struct file *file) { struct trace_iterator *iter = file->private_data; + mutex_lock(&trace_types_lock); + + if (iter->cpu_file == TRACE_PIPE_ALL_CPU) + cpumask_clear(tracing_reader_cpumask); + else + cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); + + mutex_unlock(&trace_types_lock); + free_cpumask_var(iter->started); kfree(iter); - atomic_dec(&tracing_reader); return 0; } @@ -2911,6 +2967,59 @@ struct dentry *tracing_init_dentry(void) return d_tracer; } +static struct dentry *d_percpu; + +struct dentry *tracing_dentry_percpu(void) +{ + static int once; + struct dentry *d_tracer; + + if (d_percpu) + return d_percpu; + + d_tracer = tracing_init_dentry(); + + if (!d_tracer) + return NULL; + + d_percpu = debugfs_create_dir("per_cpu", d_tracer); + + if (!d_percpu && !once) { + once = 1; + pr_warning("Could not create debugfs directory 'per_cpu'\n"); + return NULL; + } + + return d_percpu; +} + +static void tracing_init_debugfs_percpu(long cpu) +{ + struct dentry *d_percpu = tracing_dentry_percpu(); + struct dentry *entry; + /* strlen(trace_pipe) + MAX(log10(cpu)) + '\0' */ + char filename[17]; + + if (cpu > 999 || cpu < 0) + return; + + /* per cpu trace_pipe */ + sprintf(filename, "trace_pipe%ld", cpu); + + entry = debugfs_create_file(filename, 0444, d_percpu, + (void *) cpu, &tracing_pipe_fops); + if (!entry) + pr_warning("Could not create debugfs '%s' entry\n", filename); + + /* per cpu trace */ + sprintf(filename, "trace%ld", cpu); + + entry = debugfs_create_file(filename, 0444, d_percpu, + (void *) cpu, &tracing_fops); + if (!entry) + pr_warning("Could not create debugfs '%s' entry\n", filename); +} + #ifdef CONFIG_FTRACE_SELFTEST /* Let selftest have access to static functions in this file */ #include "trace_selftest.c" @@ -2920,6 +3029,7 @@ static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; struct dentry *entry; + int cpu; d_tracer = tracing_init_dentry(); @@ -2939,7 +3049,7 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); entry = debugfs_create_file("trace", 0444, d_tracer, - &global_trace, &tracing_fops); + (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); if (!entry) pr_warning("Could not create debugfs 'trace' entry\n"); @@ -2970,8 +3080,8 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'README' entry\n"); - entry = debugfs_create_file("trace_pipe", 0644, d_tracer, - NULL, &tracing_pipe_fops); + entry = debugfs_create_file("trace_pipe", 0444, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); if (!entry) pr_warning("Could not create debugfs " "'trace_pipe' entry\n"); @@ -2999,6 +3109,10 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + + for_each_tracing_cpu(cpu) + tracing_init_debugfs_percpu(cpu); + return 0; } @@ -3222,8 +3336,12 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; + if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) + goto out_free_tracing_cpumask; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); + cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(trace_buf_size, @@ -3272,6 +3390,8 @@ __init static int tracer_alloc_buffers(void) ret = 0; out_free_cpumask: + free_cpumask_var(tracing_reader_cpumask); +out_free_tracing_cpumask: free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index eed732c151fc..508235a39da6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -395,6 +395,8 @@ struct trace_seq { unsigned int readpos; }; +#define TRACE_PIPE_ALL_CPU -1 + /* * Trace iterator - used by printout routines who present trace * results to users and which routines might sleep, etc: @@ -404,6 +406,7 @@ struct trace_iterator { struct tracer *trace; void *private; struct ring_buffer_iter *buffer_iter[NR_CPUS]; + int cpu_file; /* The below is zeroed out in pipe_read */ struct trace_seq seq; -- cgit v1.2.3-59-g8ed1b From d7350c3f45694104e820041969c8185c5f99e57c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 25 Feb 2009 06:13:16 +0100 Subject: tracing/core: make the read callbacks reentrants Now that several per-cpu files can be read or spliced at the same, we want the read/splice callbacks for tracing files to be reentrants. Until now, a single global mutex (trace_types_lock) serialized the access to tracing_read_pipe(), tracing_splice_read_pipe(), and the seq helpers. Ie: it means that if a user tries to read trace_pipe0 and trace_pipe1 at the same time, the access to the function tracing_read_pipe() is contended and one reader must wait for the other to finish its read call. The trace_type_lock mutex is mostly here to serialize the access to the global current tracer (current_trace), which can be changed concurrently. Although the iter struct keeps a private pointer to this tracer, its callbacks can be changed by another function. The method used here is to not keep anymore private reference to the tracer inside the iterator but to make a copy of it inside the iterator. Then it checks on subsequents read calls if the tracer has changed. This is not costly because the current tracer is not expected to be changed often, so we use a branch prediction for that. Moreover, we add a private mutex to the iterator (there is one iterator per file descriptor) to serialize the accesses in case of multiple consumers per file descriptor (which would be a silly idea from the user). Note that this is not to protect the ring buffer, since the ring buffer already serializes the readers accesses. This is to prevent from traces weirdness in case of concurrent consumers. But these mutexes can be dropped anyway, that would not result in any crash. Just tell me what you think about it. Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 101 ++++++++++++++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 3 +- 2 files changed, 85 insertions(+), 19 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aa58b7bc847c..d8d899f3bd6f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1294,20 +1294,32 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) return ent; } +/* + * No necessary locking here. The worst thing which can + * happen is loosing events consumed at the same time + * by a trace_pipe reader. + * Other than that, we don't risk to crash the ring buffer + * because it serializes the readers. + * + * The current tracer is copied to avoid a global locking + * all around. + */ static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; + static struct tracer *old_tracer; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; int cpu; + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - - if (!current_trace || current_trace != iter->trace) { - mutex_unlock(&trace_types_lock); - return NULL; + if (unlikely(old_tracer != current_trace && current_trace)) { + old_tracer = current_trace; + *iter->trace = *current_trace; } + mutex_unlock(&trace_types_lock); atomic_inc(&trace_record_cmdline_disabled); @@ -1341,7 +1353,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) static void s_stop(struct seq_file *m, void *p) { atomic_dec(&trace_record_cmdline_disabled); - mutex_unlock(&trace_types_lock); } static void print_lat_help_header(struct seq_file *m) @@ -1691,13 +1702,25 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) goto out; } + /* + * We make a copy of the current tracer to avoid concurrent + * changes on it while we are reading. + */ mutex_lock(&trace_types_lock); + iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); + if (!iter->trace) { + *ret = -ENOMEM; + goto fail; + } + if (current_trace) + *iter->trace = *current_trace; + if (current_trace && current_trace->print_max) iter->tr = &max_tr; else iter->tr = &global_trace; - iter->trace = current_trace; iter->pos = -1; + mutex_init(&iter->mutex); iter->cpu_file = cpu_file; /* Notify the tracer early; before we stop tracing. */ @@ -1747,8 +1770,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); } -fail: + fail: mutex_unlock(&trace_types_lock); + kfree(iter->trace); kfree(iter); return ERR_PTR(-ENOMEM); @@ -1783,6 +1807,8 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); seq_release(inode, file); + mutex_destroy(&iter->mutex); + kfree(iter->trace); kfree(iter); return 0; } @@ -2392,10 +2418,21 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) goto out; } + /* + * We make a copy of the current tracer to avoid concurrent + * changes on it while we are reading. + */ + iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL); + if (!iter->trace) { + ret = -ENOMEM; + goto fail; + } + if (current_trace) + *iter->trace = *current_trace; + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { - kfree(iter); ret = -ENOMEM; - goto out; + goto fail; } /* trace pipe does not show start of buffer */ @@ -2403,7 +2440,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->cpu_file = cpu_file; iter->tr = &global_trace; - iter->trace = current_trace; + mutex_init(&iter->mutex); filp->private_data = iter; if (iter->trace->pipe_open) @@ -2412,6 +2449,12 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) out: mutex_unlock(&trace_types_lock); return ret; + +fail: + kfree(iter->trace); + kfree(iter); + mutex_unlock(&trace_types_lock); + return ret; } static int tracing_release_pipe(struct inode *inode, struct file *file) @@ -2428,6 +2471,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); + mutex_destroy(&iter->mutex); + kfree(iter->trace); kfree(iter); return 0; @@ -2497,18 +2542,15 @@ static int tracing_wait_pipe(struct file *filp) return -EAGAIN; } - mutex_unlock(&trace_types_lock); + mutex_unlock(&iter->mutex); iter->trace->wait_pipe(iter); - mutex_lock(&trace_types_lock); + mutex_lock(&iter->mutex); if (signal_pending(current)) return -EINTR; - if (iter->trace != current_trace) - return 0; - /* * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never @@ -2533,6 +2575,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; + static struct tracer *old_tracer; ssize_t sret; /* return any leftover data */ @@ -2542,7 +2585,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, trace_seq_reset(&iter->seq); + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); + if (unlikely(old_tracer != current_trace && current_trace)) { + old_tracer = current_trace; + *iter->trace = *current_trace; + } + mutex_unlock(&trace_types_lock); + + /* + * Avoid more than one consumer on a single file descriptor + * This is just a matter of traces coherency, the ring buffer itself + * is protected. + */ + mutex_lock(&iter->mutex); if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) @@ -2599,7 +2655,7 @@ waitagain: goto waitagain; out: - mutex_unlock(&trace_types_lock); + mutex_unlock(&iter->mutex); return sret; } @@ -2676,11 +2732,20 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; + static struct tracer *old_tracer; ssize_t ret; size_t rem; unsigned int i; + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); + if (unlikely(old_tracer != current_trace && current_trace)) { + old_tracer = current_trace; + *iter->trace = *current_trace; + } + mutex_unlock(&trace_types_lock); + + mutex_lock(&iter->mutex); if (iter->trace->splice_read) { ret = iter->trace->splice_read(iter, filp, @@ -2720,14 +2785,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_seq_reset(&iter->seq); } - mutex_unlock(&trace_types_lock); + mutex_unlock(&iter->mutex); spd.nr_pages = i; return splice_to_pipe(pipe, &spd); out_err: - mutex_unlock(&trace_types_lock); + mutex_unlock(&iter->mutex); return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 508235a39da6..632191770aac 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -405,8 +405,9 @@ struct trace_iterator { struct trace_array *tr; struct tracer *trace; void *private; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; int cpu_file; + struct mutex mutex; + struct ring_buffer_iter *buffer_iter[NR_CPUS]; /* The below is zeroed out in pipe_read */ struct trace_seq seq; -- cgit v1.2.3-59-g8ed1b From 8969a5ede0f9e17da4b943712429aef2c9bcd82b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2009 13:59:47 +0100 Subject: generic-ipi: remove kmalloc() Remove the use of kmalloc() from the smp_call_function_*() calls. Steven's generic-ipi patch (d7240b98: generic-ipi: use per cpu data for single cpu ipi calls) started the discussion on the use of kmalloc() in this code and fixed the smp_call_function_single(.wait=0) fallback case. In this patch we complete this by also providing means for the _many() call, which fully removes the need for kmalloc() in this code. The problem with the _many() call is that other cpus might still be observing our entry when we're done with it. It solved this by dynamically allocating data elements and RCU-freeing it. We solve it by using a single per-cpu entry which provides static storage and solves one half of the problem (avoiding referencing freed data). The other half, ensuring the queue iteration it still possible, is done by placing re-used entries at the head of the list. This means that if someone was still iterating that entry when it got moved, he will now re-visit the entries on the list he had already seen, but avoids skipping over entries like would have happened had we placed the new entry at the end. Furthermore, visiting entries twice is not a problem, since we remove our cpu from the entry's cpumask once its called. Many thanks to Oleg for his suggestions and him poking holes in my earlier attempts. Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Nick Piggin Cc: Jens Axboe Cc: "Paul E. McKenney" Cc: Rusty Russell Signed-off-by: Ingo Molnar --- kernel/smp.c | 264 +++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 166 insertions(+), 98 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 6ecf4b9895d4..7a0ce25829dc 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -10,23 +10,28 @@ #include #include #include +#include static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); -static LIST_HEAD(call_function_queue); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); + +static struct { + struct list_head queue; + spinlock_t lock; +} call_function __cacheline_aligned_in_smp = { + .queue = LIST_HEAD_INIT(call_function.queue), + .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), +}; enum { CSD_FLAG_WAIT = 0x01, - CSD_FLAG_ALLOC = 0x02, - CSD_FLAG_LOCK = 0x04, + CSD_FLAG_LOCK = 0x02, }; struct call_function_data { struct call_single_data csd; spinlock_t lock; unsigned int refs; - struct rcu_head rcu_head; - unsigned long cpumask_bits[]; + cpumask_var_t cpumask; }; struct call_single_queue { @@ -34,8 +39,45 @@ struct call_single_queue { spinlock_t lock; }; +static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { + .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), +}; + +static int +hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + struct call_function_data *cfd = &per_cpu(cfd_data, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + cpu_to_node(cpu))) + return NOTIFY_BAD; + break; + +#ifdef CONFIG_CPU_HOTPLUG + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + + case CPU_DEAD: + case CPU_DEAD_FROZEN: + free_cpumask_var(cfd->cpumask); + break; +#endif + }; + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { + .notifier_call = hotplug_cfd, +}; + static int __cpuinit init_call_single_data(void) { + void *cpu = (void *)(long)smp_processor_id(); int i; for_each_possible_cpu(i) { @@ -44,18 +86,69 @@ static int __cpuinit init_call_single_data(void) spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } + + hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); + register_cpu_notifier(&hotplug_cfd_notifier); + return 0; } early_initcall(init_call_single_data); -static void csd_flag_wait(struct call_single_data *data) +/* + * csd_wait/csd_complete are used for synchronous ipi calls + */ +static void csd_wait_prepare(struct call_single_data *data) { - /* Wait for response */ - do { - if (!(data->flags & CSD_FLAG_WAIT)) - break; + data->flags |= CSD_FLAG_WAIT; +} + +static void csd_complete(struct call_single_data *data) +{ + if (data->flags & CSD_FLAG_WAIT) { + /* + * ensure we're all done before saying we are + */ + smp_mb(); + data->flags &= ~CSD_FLAG_WAIT; + } +} + +static void csd_wait(struct call_single_data *data) +{ + while (data->flags & CSD_FLAG_WAIT) cpu_relax(); - } while (1); +} + +/* + * csd_lock/csd_unlock used to serialize access to per-cpu csd resources + * + * For non-synchronous ipi calls the csd can still be in use by the previous + * function call. For multi-cpu calls its even more interesting as we'll have + * to ensure no other cpu is observing our csd. + */ +static void csd_lock(struct call_single_data *data) +{ + while (data->flags & CSD_FLAG_LOCK) + cpu_relax(); + data->flags = CSD_FLAG_LOCK; + + /* + * prevent CPU from reordering the above assignment to ->flags + * with any subsequent assignments to other fields of the + * specified call_single_data structure. + */ + + smp_mb(); +} + +static void csd_unlock(struct call_single_data *data) +{ + WARN_ON(!(data->flags & CSD_FLAG_LOCK)); + /* + * ensure we're all done before releasing data + */ + smp_mb(); + data->flags &= ~CSD_FLAG_LOCK; } /* @@ -89,16 +182,7 @@ static void generic_exec_single(int cpu, struct call_single_data *data) arch_send_call_function_single_ipi(cpu); if (wait) - csd_flag_wait(data); -} - -static void rcu_free_call_data(struct rcu_head *head) -{ - struct call_function_data *data; - - data = container_of(head, struct call_function_data, rcu_head); - - kfree(data); + csd_wait(data); } /* @@ -122,41 +206,35 @@ void generic_smp_call_function_interrupt(void) * It's ok to use list_for_each_rcu() here even though we may delete * 'pos', since list_del_rcu() doesn't clear ->next */ - rcu_read_lock(); - list_for_each_entry_rcu(data, &call_function_queue, csd.list) { + list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) + spin_lock(&data->lock); + if (!cpumask_test_cpu(cpu, data->cpumask)) { + spin_unlock(&data->lock); continue; + } + cpumask_clear_cpu(cpu, data->cpumask); + spin_unlock(&data->lock); data->csd.func(data->csd.info); spin_lock(&data->lock); - cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits)); WARN_ON(data->refs == 0); - data->refs--; - refs = data->refs; + refs = --data->refs; + if (!refs) { + spin_lock(&call_function.lock); + list_del_rcu(&data->csd.list); + spin_unlock(&call_function.lock); + } spin_unlock(&data->lock); if (refs) continue; - spin_lock(&call_function_lock); - list_del_rcu(&data->csd.list); - spin_unlock(&call_function_lock); - - if (data->csd.flags & CSD_FLAG_WAIT) { - /* - * serialize stores to data with the flag clear - * and wakeup - */ - smp_wmb(); - data->csd.flags &= ~CSD_FLAG_WAIT; - } - if (data->csd.flags & CSD_FLAG_ALLOC) - call_rcu(&data->rcu_head, rcu_free_call_data); + csd_complete(&data->csd); + csd_unlock(&data->csd); } - rcu_read_unlock(); put_cpu(); } @@ -192,14 +270,14 @@ void generic_smp_call_function_single_interrupt(void) data->func(data->info); - if (data_flags & CSD_FLAG_WAIT) { - smp_wmb(); - data->flags &= ~CSD_FLAG_WAIT; - } else if (data_flags & CSD_FLAG_LOCK) { - smp_wmb(); - data->flags &= ~CSD_FLAG_LOCK; - } else if (data_flags & CSD_FLAG_ALLOC) - kfree(data); + if (data_flags & CSD_FLAG_WAIT) + csd_complete(data); + + /* + * Unlocked CSDs are valid through generic_exec_single() + */ + if (data_flags & CSD_FLAG_LOCK) + csd_unlock(data); } } @@ -218,7 +296,9 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data); int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) { - struct call_single_data d; + struct call_single_data d = { + .flags = 0, + }; unsigned long flags; /* prevent preemption and reschedule on another processor, as well as CPU removal */ @@ -239,13 +319,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, /* * We are calling a function on a single CPU * and we are not going to wait for it to finish. - * We first try to allocate the data, but if we - * fail, we fall back to use a per cpu data to pass - * the information to that CPU. Since all callers - * of this code will use the same data, we must - * synchronize the callers to prevent a new caller - * from corrupting the data before the callee - * can access it. + * We use a per cpu data to pass the information to + * that CPU. Since all callers of this code will + * use the same data, we must synchronize the + * callers to prevent a new caller from corrupting + * the data before the callee can access it. * * The CSD_FLAG_LOCK is used to let us know when * the IPI handler is done with the data. @@ -255,18 +333,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, * will make sure the callee is done with the * data before a new caller will use it. */ - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) - data->flags = CSD_FLAG_ALLOC; - else { - data = &per_cpu(csd_data, me); - while (data->flags & CSD_FLAG_LOCK) - cpu_relax(); - data->flags = CSD_FLAG_LOCK; - } + data = &__get_cpu_var(csd_data); + csd_lock(data); } else { data = &d; - data->flags = CSD_FLAG_WAIT; + csd_wait_prepare(data); } data->func = func; @@ -326,14 +397,14 @@ void smp_call_function_many(const struct cpumask *mask, { struct call_function_data *data; unsigned long flags; - int cpu, next_cpu; + int cpu, next_cpu, me = smp_processor_id(); /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); - if (cpu == smp_processor_id()) + if (cpu == me) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); /* No online cpus? We're done. */ if (cpu >= nr_cpu_ids) @@ -341,7 +412,7 @@ void smp_call_function_many(const struct cpumask *mask, /* Do we have another CPU which isn't us? */ next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - if (next_cpu == smp_processor_id()) + if (next_cpu == me) next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); /* Fastpath: do that cpu by itself. */ @@ -350,31 +421,28 @@ void smp_call_function_many(const struct cpumask *mask, return; } - data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); - if (unlikely(!data)) { - /* Slow path. */ - for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) - continue; - if (cpumask_test_cpu(cpu, mask)) - smp_call_function_single(cpu, func, info, wait); - } - return; - } + data = &__get_cpu_var(cfd_data); + csd_lock(&data->csd); - spin_lock_init(&data->lock); - data->csd.flags = CSD_FLAG_ALLOC; + spin_lock_irqsave(&data->lock, flags); if (wait) - data->csd.flags |= CSD_FLAG_WAIT; + csd_wait_prepare(&data->csd); + data->csd.func = func; data->csd.info = info; - cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); - data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); + cpumask_and(data->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(me, data->cpumask); + data->refs = cpumask_weight(data->cpumask); - spin_lock_irqsave(&call_function_lock, flags); - list_add_tail_rcu(&data->csd.list, &call_function_queue); - spin_unlock_irqrestore(&call_function_lock, flags); + spin_lock(&call_function.lock); + /* + * Place entry at the _HEAD_ of the list, so that any cpu still + * observing the entry in generic_smp_call_function_interrupt() will + * not miss any other list entries. + */ + list_add_rcu(&data->csd.list, &call_function.queue); + spin_unlock(&call_function.lock); + spin_unlock_irqrestore(&data->lock, flags); /* * Make the list addition visible before sending the ipi. @@ -384,11 +452,11 @@ void smp_call_function_many(const struct cpumask *mask, smp_mb(); /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); + arch_send_call_function_ipi_mask(data->cpumask); /* optionally wait for the CPUs to complete */ if (wait) - csd_flag_wait(&data->csd); + csd_wait(&data->csd); } EXPORT_SYMBOL(smp_call_function_many); @@ -418,20 +486,20 @@ EXPORT_SYMBOL(smp_call_function); void ipi_call_lock(void) { - spin_lock(&call_function_lock); + spin_lock(&call_function.lock); } void ipi_call_unlock(void) { - spin_unlock(&call_function_lock); + spin_unlock(&call_function.lock); } void ipi_call_lock_irq(void) { - spin_lock_irq(&call_function_lock); + spin_lock_irq(&call_function.lock); } void ipi_call_unlock_irq(void) { - spin_unlock_irq(&call_function_lock); + spin_unlock_irq(&call_function.lock); } -- cgit v1.2.3-59-g8ed1b From 6e2756376c706e4da3454a272947983f92e80a7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2009 13:59:48 +0100 Subject: generic-ipi: remove CSD_FLAG_WAIT Oleg noticed that we don't strictly need CSD_FLAG_WAIT, rework the code so that we can use CSD_FLAG_LOCK for both purposes. Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Nick Piggin Cc: Jens Axboe Cc: "Paul E. McKenney" Cc: Rusty Russell Signed-off-by: Ingo Molnar --- block/blk-softirq.c | 2 +- include/linux/smp.h | 3 +- kernel/sched.c | 2 +- kernel/smp.c | 90 ++++++++++++++--------------------------------------- kernel/softirq.c | 2 +- 5 files changed, 28 insertions(+), 71 deletions(-) diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ce0efc6b26dc..ee9c21602228 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -64,7 +64,7 @@ static int raise_blk_irq(int cpu, struct request *rq) data->info = rq; data->flags = 0; - __smp_call_function_single(cpu, data); + __smp_call_function_single(cpu, data, 0); return 0; } diff --git a/include/linux/smp.h b/include/linux/smp.h index 715196b09d67..00866d7fdf34 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -82,7 +82,8 @@ smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, return 0; } -void __smp_call_function_single(int cpuid, struct call_single_data *data); +void __smp_call_function_single(int cpuid, struct call_single_data *data, + int wait); /* * Generic and arch helpers diff --git a/kernel/sched.c b/kernel/sched.c index 410eec404133..d4c2749a2998 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1093,7 +1093,7 @@ static void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { hrtimer_restart(timer); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; } } diff --git a/kernel/smp.c b/kernel/smp.c index 7a0ce25829dc..f5308258891a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -23,8 +23,7 @@ static struct { }; enum { - CSD_FLAG_WAIT = 0x01, - CSD_FLAG_LOCK = 0x02, + CSD_FLAG_LOCK = 0x01, }; struct call_function_data { @@ -94,31 +93,6 @@ static int __cpuinit init_call_single_data(void) } early_initcall(init_call_single_data); -/* - * csd_wait/csd_complete are used for synchronous ipi calls - */ -static void csd_wait_prepare(struct call_single_data *data) -{ - data->flags |= CSD_FLAG_WAIT; -} - -static void csd_complete(struct call_single_data *data) -{ - if (data->flags & CSD_FLAG_WAIT) { - /* - * ensure we're all done before saying we are - */ - smp_mb(); - data->flags &= ~CSD_FLAG_WAIT; - } -} - -static void csd_wait(struct call_single_data *data) -{ - while (data->flags & CSD_FLAG_WAIT) - cpu_relax(); -} - /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * @@ -126,10 +100,15 @@ static void csd_wait(struct call_single_data *data) * function call. For multi-cpu calls its even more interesting as we'll have * to ensure no other cpu is observing our csd. */ -static void csd_lock(struct call_single_data *data) +static void csd_lock_wait(struct call_single_data *data) { while (data->flags & CSD_FLAG_LOCK) cpu_relax(); +} + +static void csd_lock(struct call_single_data *data) +{ + csd_lock_wait(data); data->flags = CSD_FLAG_LOCK; /* @@ -155,11 +134,12 @@ static void csd_unlock(struct call_single_data *data) * Insert a previously allocated call_single_data element for execution * on the given CPU. data must already have ->func, ->info, and ->flags set. */ -static void generic_exec_single(int cpu, struct call_single_data *data) +static +void generic_exec_single(int cpu, struct call_single_data *data, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); - int wait = data->flags & CSD_FLAG_WAIT, ipi; unsigned long flags; + int ipi; spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); @@ -182,7 +162,7 @@ static void generic_exec_single(int cpu, struct call_single_data *data) arch_send_call_function_single_ipi(cpu); if (wait) - csd_wait(data); + csd_lock_wait(data); } /* @@ -232,7 +212,6 @@ void generic_smp_call_function_interrupt(void) if (refs) continue; - csd_complete(&data->csd); csd_unlock(&data->csd); } @@ -270,9 +249,6 @@ void generic_smp_call_function_single_interrupt(void) data->func(data->info); - if (data_flags & CSD_FLAG_WAIT) - csd_complete(data); - /* * Unlocked CSDs are valid through generic_exec_single() */ @@ -313,36 +289,16 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, func(info); local_irq_restore(flags); } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data; + struct call_single_data *data = &d; - if (!wait) { - /* - * We are calling a function on a single CPU - * and we are not going to wait for it to finish. - * We use a per cpu data to pass the information to - * that CPU. Since all callers of this code will - * use the same data, we must synchronize the - * callers to prevent a new caller from corrupting - * the data before the callee can access it. - * - * The CSD_FLAG_LOCK is used to let us know when - * the IPI handler is done with the data. - * The first caller will set it, and the callee - * will clear it. The next caller must wait for - * it to clear before we set it again. This - * will make sure the callee is done with the - * data before a new caller will use it. - */ + if (!wait) data = &__get_cpu_var(csd_data); - csd_lock(data); - } else { - data = &d; - csd_wait_prepare(data); - } + + csd_lock(data); data->func = func; data->info = info; - generic_exec_single(cpu, data); + generic_exec_single(cpu, data, wait); } else { err = -ENXIO; /* CPU not online */ } @@ -362,12 +318,15 @@ EXPORT_SYMBOL(smp_call_function_single); * instance. * */ -void __smp_call_function_single(int cpu, struct call_single_data *data) +void __smp_call_function_single(int cpu, struct call_single_data *data, + int wait) { + csd_lock(data); + /* Can deadlock when called with interrupts disabled */ - WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); + WARN_ON(wait && irqs_disabled()); - generic_exec_single(cpu, data); + generic_exec_single(cpu, data, wait); } /* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ @@ -425,9 +384,6 @@ void smp_call_function_many(const struct cpumask *mask, csd_lock(&data->csd); spin_lock_irqsave(&data->lock, flags); - if (wait) - csd_wait_prepare(&data->csd); - data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); @@ -456,7 +412,7 @@ void smp_call_function_many(const struct cpumask *mask, /* optionally wait for the CPUs to complete */ if (wait) - csd_wait(&data->csd); + csd_lock_wait(&data->csd); } EXPORT_SYMBOL(smp_call_function_many); diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..48c3d5d627a8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -496,7 +496,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir cp->flags = 0; cp->priv = softirq; - __smp_call_function_single(cpu, cp); + __smp_call_function_single(cpu, cp, 0); return 0; } return 1; -- cgit v1.2.3-59-g8ed1b From 0b13fda1e0936b3d64c4c407f183d33fa6bd2ad4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Feb 2009 16:52:11 +0100 Subject: generic-ipi: cleanups Andrew pointed out that there's some small amount of style rot in kernel/smp.c. Clean it up. Reported-by: Andrew Morton Cc: Nick Piggin Cc: Jens Axboe Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/smp.c | 162 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 86 insertions(+), 76 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index f5308258891a..7ad2262d2eca 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -2,13 +2,12 @@ * Generic helpers for smp ipi calls * * (C) Jens Axboe 2008 - * */ -#include -#include -#include #include #include +#include +#include +#include #include #include @@ -17,29 +16,30 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); static struct { struct list_head queue; spinlock_t lock; -} call_function __cacheline_aligned_in_smp = { - .queue = LIST_HEAD_INIT(call_function.queue), - .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), -}; +} call_function __cacheline_aligned_in_smp = + { + .queue = LIST_HEAD_INIT(call_function.queue), + .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), + }; enum { CSD_FLAG_LOCK = 0x01, }; struct call_function_data { - struct call_single_data csd; - spinlock_t lock; - unsigned int refs; - cpumask_var_t cpumask; + struct call_single_data csd; + spinlock_t lock; + unsigned int refs; + cpumask_var_t cpumask; }; struct call_single_queue { - struct list_head list; - spinlock_t lock; + struct list_head list; + spinlock_t lock; }; static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { - .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), + .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), }; static int @@ -71,7 +71,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) } static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { - .notifier_call = hotplug_cfd, + .notifier_call = hotplug_cfd, }; static int __cpuinit init_call_single_data(void) @@ -96,9 +96,9 @@ early_initcall(init_call_single_data); /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * - * For non-synchronous ipi calls the csd can still be in use by the previous - * function call. For multi-cpu calls its even more interesting as we'll have - * to ensure no other cpu is observing our csd. + * For non-synchronous ipi calls the csd can still be in use by the + * previous function call. For multi-cpu calls its even more interesting + * as we'll have to ensure no other cpu is observing our csd. */ static void csd_lock_wait(struct call_single_data *data) { @@ -112,27 +112,29 @@ static void csd_lock(struct call_single_data *data) data->flags = CSD_FLAG_LOCK; /* - * prevent CPU from reordering the above assignment to ->flags - * with any subsequent assignments to other fields of the - * specified call_single_data structure. + * prevent CPU from reordering the above assignment + * to ->flags with any subsequent assignments to other + * fields of the specified call_single_data structure: */ - smp_mb(); } static void csd_unlock(struct call_single_data *data) { WARN_ON(!(data->flags & CSD_FLAG_LOCK)); + /* - * ensure we're all done before releasing data + * ensure we're all done before releasing data: */ smp_mb(); + data->flags &= ~CSD_FLAG_LOCK; } /* - * Insert a previously allocated call_single_data element for execution - * on the given CPU. data must already have ->func, ->info, and ->flags set. + * Insert a previously allocated call_single_data element + * for execution on the given CPU. data must already have + * ->func, ->info, and ->flags set. */ static void generic_exec_single(int cpu, struct call_single_data *data, int wait) @@ -154,10 +156,9 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait) * If IPIs can go out of order to the cache coherency protocol * in an architecture, sufficient synchronisation should be added * to arch code to make it appear to obey cache coherency WRT - * locking and barrier primitives. Generic code isn't really equipped - * to do the right thing... + * locking and barrier primitives. Generic code isn't really + * equipped to do the right thing... */ - if (ipi) arch_send_call_function_single_ipi(cpu); @@ -183,8 +184,8 @@ void generic_smp_call_function_interrupt(void) smp_mb(); /* - * It's ok to use list_for_each_rcu() here even though we may delete - * 'pos', since list_del_rcu() doesn't clear ->next + * It's ok to use list_for_each_rcu() here even though we may + * delete 'pos', since list_del_rcu() doesn't clear ->next */ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; @@ -219,14 +220,14 @@ void generic_smp_call_function_interrupt(void) } /* - * Invoked by arch to handle an IPI for call function single. Must be called - * from the arch with interrupts disabled. + * Invoked by arch to handle an IPI for call function single. Must be + * called from the arch with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { struct call_single_queue *q = &__get_cpu_var(call_single_queue); - LIST_HEAD(list); unsigned int data_flags; + LIST_HEAD(list); spin_lock(&q->lock); list_replace_init(&q->list, &list); @@ -235,22 +236,20 @@ void generic_smp_call_function_single_interrupt(void) while (!list_empty(&list)) { struct call_single_data *data; - data = list_entry(list.next, struct call_single_data, - list); + data = list_entry(list.next, struct call_single_data, list); list_del(&data->list); /* - * 'data' can be invalid after this call if - * flags == 0 (when called through - * generic_exec_single(), so save them away before - * making the call. + * 'data' can be invalid after this call if flags == 0 + * (when called through generic_exec_single()), + * so save them away before making the call: */ data_flags = data->flags; data->func(data->info); /* - * Unlocked CSDs are valid through generic_exec_single() + * Unlocked CSDs are valid through generic_exec_single(): */ if (data_flags & CSD_FLAG_LOCK) csd_unlock(data); @@ -276,34 +275,41 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, .flags = 0, }; unsigned long flags; - /* prevent preemption and reschedule on another processor, - as well as CPU removal */ - int me = get_cpu(); + int this_cpu; int err = 0; + /* + * prevent preemption and reschedule on another processor, + * as well as CPU removal + */ + this_cpu = get_cpu(); + /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); - if (cpu == me) { + if (cpu == this_cpu) { local_irq_save(flags); func(info); local_irq_restore(flags); - } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data = &d; + } else { + if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { + struct call_single_data *data = &d; - if (!wait) - data = &__get_cpu_var(csd_data); + if (!wait) + data = &__get_cpu_var(csd_data); - csd_lock(data); + csd_lock(data); - data->func = func; - data->info = info; - generic_exec_single(cpu, data, wait); - } else { - err = -ENXIO; /* CPU not online */ + data->func = func; + data->info = info; + generic_exec_single(cpu, data, wait); + } else { + err = -ENXIO; /* CPU not online */ + } } put_cpu(); + return err; } EXPORT_SYMBOL(smp_call_function_single); @@ -313,10 +319,9 @@ EXPORT_SYMBOL(smp_call_function_single); * @cpu: The CPU to run on. * @data: Pre-allocated and setup data structure * - * Like smp_call_function_single(), but allow caller to pass in a pre-allocated - * data structure. Useful for embedding @data inside other structures, for - * instance. - * + * Like smp_call_function_single(), but allow caller to pass in a + * pre-allocated data structure. Useful for embedding @data inside + * other structures, for instance. */ void __smp_call_function_single(int cpu, struct call_single_data *data, int wait) @@ -329,10 +334,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, generic_exec_single(cpu, data, wait); } -/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ +/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ + #ifndef arch_send_call_function_ipi_mask -#define arch_send_call_function_ipi_mask(maskp) \ - arch_send_call_function_ipi(*(maskp)) +# define arch_send_call_function_ipi_mask(maskp) \ + arch_send_call_function_ipi(*(maskp)) #endif /** @@ -340,7 +346,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * If @wait is true, then returns once @func has returned. Note that @wait * will be implicitly turned on in case of allocation failures, since @@ -351,27 +358,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, - void (*func)(void *), void *info, - bool wait) + void (*func)(void *), void *info, bool wait) { struct call_function_data *data; unsigned long flags; - int cpu, next_cpu, me = smp_processor_id(); + int cpu, next_cpu, this_cpu = smp_processor_id(); /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); - /* So, what's a CPU they want? Ignoring this one. */ + /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); - if (cpu == me) + if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + /* No online cpus? We're done. */ if (cpu >= nr_cpu_ids) return; /* Do we have another CPU which isn't us? */ next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - if (next_cpu == me) + if (next_cpu == this_cpu) next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); /* Fastpath: do that cpu by itself. */ @@ -387,30 +394,31 @@ void smp_call_function_many(const struct cpumask *mask, data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(me, data->cpumask); + cpumask_clear_cpu(this_cpu, data->cpumask); data->refs = cpumask_weight(data->cpumask); spin_lock(&call_function.lock); /* * Place entry at the _HEAD_ of the list, so that any cpu still - * observing the entry in generic_smp_call_function_interrupt() will - * not miss any other list entries. + * observing the entry in generic_smp_call_function_interrupt() + * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); spin_unlock(&call_function.lock); + spin_unlock_irqrestore(&data->lock, flags); /* * Make the list addition visible before sending the ipi. - * (IPIs must obey or appear to obey normal Linux cache coherency - * rules -- see comment in generic_exec_single). + * (IPIs must obey or appear to obey normal Linux cache + * coherency rules -- see comment in generic_exec_single). */ smp_mb(); /* Send a message to all CPUs in the map */ arch_send_call_function_ipi_mask(data->cpumask); - /* optionally wait for the CPUs to complete */ + /* Optionally wait for the CPUs to complete */ if (wait) csd_lock_wait(&data->csd); } @@ -420,7 +428,8 @@ EXPORT_SYMBOL(smp_call_function_many); * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * Returns 0. * @@ -436,6 +445,7 @@ int smp_call_function(void (*func)(void *), void *info, int wait) preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); + return 0; } EXPORT_SYMBOL(smp_call_function); -- cgit v1.2.3-59-g8ed1b From eef62a6826b8ab530cefff5aa55c1661a209c803 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Feb 2009 15:49:52 -0500 Subject: tracing: rename DEFINE_TRACE_FMT to just TRACE_FORMAT There's been a bit confusion to whether DEFINE/DECLARE_TRACE_FMT should be a DEFINE or a DECLARE. Ingo Molnar suggested simply calling it TRACE_FORMAT. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- include/trace/sched_event_types.h | 26 +++++++++++++------------- kernel/trace/trace_events.h | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 34ae464effff..3de09fa8e01d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,7 +153,7 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_sched(); } -#define DEFINE_TRACE_FMT(name, proto, args, fmt) \ +#define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args)) #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index a4f662940f4e..a3d3d66a51c8 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,72 +1,72 @@ /* use instead */ -#ifndef DEFINE_TRACE_FMT +#ifndef TRACE_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. #endif -DEFINE_TRACE_FMT(sched_kthread_stop, +TRACE_FORMAT(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t), TPFMT("task %s:%d", t->comm, t->pid)); -DEFINE_TRACE_FMT(sched_kthread_stop_ret, +TRACE_FORMAT(sched_kthread_stop_ret, TPPROTO(int ret), TPARGS(ret), TPFMT("ret=%d", ret)); -DEFINE_TRACE_FMT(sched_wait_task, +TRACE_FORMAT(sched_wait_task, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_wakeup, +TRACE_FORMAT(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d %s", p->comm, p->pid, success?"succeeded":"failed")); -DEFINE_TRACE_FMT(sched_wakeup_new, +TRACE_FORMAT(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d", p->comm, p->pid, success?"succeeded":"failed")); -DEFINE_TRACE_FMT(sched_switch, +TRACE_FORMAT(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next), TPFMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid)); -DEFINE_TRACE_FMT(sched_migrate_task, +TRACE_FORMAT(sched_migrate_task, TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), TPARGS(p, orig_cpu, dest_cpu), TPFMT("task %s:%d from: %d to: %d", p->comm, p->pid, orig_cpu, dest_cpu)); -DEFINE_TRACE_FMT(sched_process_free, +TRACE_FORMAT(sched_process_free, TPPROTO(struct task_struct *p), TPARGS(p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_process_exit, +TRACE_FORMAT(sched_process_exit, TPPROTO(struct task_struct *p), TPARGS(p), TPFMT("task %s:%d", p->comm, p->pid)); -DEFINE_TRACE_FMT(sched_process_wait, +TRACE_FORMAT(sched_process_wait, TPPROTO(struct pid *pid), TPARGS(pid), TPFMT("pid %d", pid)); -DEFINE_TRACE_FMT(sched_process_fork, +TRACE_FORMAT(sched_process_fork, TPPROTO(struct task_struct *parent, struct task_struct *child), TPARGS(parent, child), TPFMT("parent %s:%d child %s:%d", parent->comm, parent->pid, child->comm, child->pid)); -DEFINE_TRACE_FMT(sched_signal_send, +TRACE_FORMAT(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p), TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid)); diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h index cb8455b3ac9a..deb95e5006c8 100644 --- a/kernel/trace/trace_events.h +++ b/kernel/trace/trace_events.h @@ -17,8 +17,8 @@ struct ftrace_event_call { #undef TPFMT #define TPFMT(fmt, args...) fmt "\n", ##args -#undef DEFINE_TRACE_FMT -#define DEFINE_TRACE_FMT(call, proto, args, fmt) \ +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ { \ event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ -- cgit v1.2.3-59-g8ed1b From 3cdfdf91fcc77cfc82592e2b5c2ab35abe819c41 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Feb 2009 15:54:30 -0500 Subject: tracing: wrap arguments with PARAMS Peter Zijlstra warned that TPPROTO and TPARGS might become something other than a simple copy of itself. To prevent this from having side effects in the TRACE_FORMAT macro in tracepoint.h, we add a PARAMS() macro to be defined as just a wrapper. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 3de09fa8e01d..62d13391a240 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -153,7 +153,8 @@ static inline void tracepoint_synchronize_unregister(void) synchronize_sched(); } +#define PARAMS(args...) args #define TRACE_FORMAT(name, proto, args, fmt) \ - DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args)) + DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #endif -- cgit v1.2.3-59-g8ed1b From 4658e4ef9d252c26630268b20ceab78b3952db41 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 26 Feb 2009 11:27:23 +0800 Subject: ACPI: introduce sysfs I/F for dynamic tables SSDT tables may be loaded at runtime. create sysfs I/F for these dynamic tables in /sys/firmware/acpi/tables/dynamic/. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- drivers/acpi/system.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index 391d0358a592..c8859047acfe 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -62,6 +62,7 @@ module_param_call(acpica_version, NULL, param_get_acpica_version, NULL, 0444); -------------------------------------------------------------------------- */ static LIST_HEAD(acpi_table_attr_list); static struct kobject *tables_kobj; +static struct kobject *dynamic_tables_kobj; struct acpi_table_attr { struct bin_attribute attr; @@ -128,6 +129,40 @@ static void acpi_table_attr_init(struct acpi_table_attr *table_attr, return; } +static acpi_status +acpi_sysfs_table_handler(u32 event, void *table, void *context) +{ + struct acpi_table_attr *table_attr; + + switch (event) { + case ACPI_TABLE_EVENT_LOAD: + table_attr = + kzalloc(sizeof(struct acpi_table_attr), GFP_KERNEL); + if (!table_attr) + return AE_NO_MEMORY; + + acpi_table_attr_init(table_attr, table); + if (sysfs_create_bin_file(dynamic_tables_kobj, + &table_attr->attr)) { + kfree(table_attr); + return AE_ERROR; + } else + list_add_tail(&table_attr->node, + &acpi_table_attr_list); + break; + case ACPI_TABLE_EVENT_UNLOAD: + /* + * we do not need to do anything right now + * because the table is not deleted from the + * global table list when unloading it. + */ + break; + default: + return AE_BAD_PARAMETER; + } + return AE_OK; +} + static int acpi_system_sysfs_init(void) { struct acpi_table_attr *table_attr; @@ -137,7 +172,11 @@ static int acpi_system_sysfs_init(void) tables_kobj = kobject_create_and_add("tables", acpi_kobj); if (!tables_kobj) - return -ENOMEM; + goto err; + + dynamic_tables_kobj = kobject_create_and_add("dynamic", tables_kobj); + if (!dynamic_tables_kobj) + goto err_dynamic_tables; do { result = acpi_get_table_by_index(table_index, &table_header); @@ -162,8 +201,14 @@ static int acpi_system_sysfs_init(void) } } while (!result); kobject_uevent(tables_kobj, KOBJ_ADD); - - return 0; + kobject_uevent(dynamic_tables_kobj, KOBJ_ADD); + result = acpi_install_table_handler(acpi_sysfs_table_handler, NULL); + + return result == AE_OK ? 0 : -EINVAL; +err_dynamic_tables: + kobject_put(tables_kobj); +err: + return -ENOMEM; } /* -- cgit v1.2.3-59-g8ed1b From 8656e7a2fa6afcd8682990f804a2a9674568738f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 26 Feb 2009 00:41:38 +0100 Subject: tracing/core: make the per cpu trace files in per cpu directories Impact: restructure the VFS layout of per CPU trace buffers The per cpu trace files are all in a single directory: /debug/tracing/per_cpu. In case of a large number of cpu, the content of this directory becomes messy so we create now one directory per cpu inside /debug/tracing/per_cpu which contain each their own trace_pipe and trace files. Ie: /debug/tracing$ ls -R per_cpu per_cpu: cpu0 cpu1 per_cpu/cpu0: trace trace_pipe per_cpu/cpu1: trace trace_pipe Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Lai Jiangshan Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d8d899f3bd6f..bdaf60d3d337 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3061,28 +3061,31 @@ struct dentry *tracing_dentry_percpu(void) static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); - struct dentry *entry; - /* strlen(trace_pipe) + MAX(log10(cpu)) + '\0' */ - char filename[17]; + struct dentry *entry, *d_cpu; + /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ + char cpu_dir[7]; if (cpu > 999 || cpu < 0) return; - /* per cpu trace_pipe */ - sprintf(filename, "trace_pipe%ld", cpu); + sprintf(cpu_dir, "cpu%ld", cpu); + d_cpu = debugfs_create_dir(cpu_dir, d_percpu); + if (!d_cpu) { + pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); + return; + } - entry = debugfs_create_file(filename, 0444, d_percpu, + /* per cpu trace_pipe */ + entry = debugfs_create_file("trace_pipe", 0444, d_cpu, (void *) cpu, &tracing_pipe_fops); if (!entry) - pr_warning("Could not create debugfs '%s' entry\n", filename); + pr_warning("Could not create debugfs 'trace_pipe' entry\n"); /* per cpu trace */ - sprintf(filename, "trace%ld", cpu); - - entry = debugfs_create_file(filename, 0444, d_percpu, + entry = debugfs_create_file("trace", 0444, d_cpu, (void *) cpu, &tracing_fops); if (!entry) - pr_warning("Could not create debugfs '%s' entry\n", filename); + pr_warning("Could not create debugfs 'trace' entry\n"); } #ifdef CONFIG_FTRACE_SELFTEST -- cgit v1.2.3-59-g8ed1b From af39241b90a345556b8884adff87096afe71b050 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 26 Feb 2009 10:11:05 -0500 Subject: tracing, genirq: add irq enter and exit trace events Impact: add new tracepoints Add them to the generic IRQ code, that way every architecture gets these new tracepoints, not just x86. Using Steve's new 'TRACE_FORMAT', I can get function graph trace as follows using the original two IRQ tracepoints: 3) | handle_IRQ_event() { 3) | /* (irq_handler_entry) irq=28 handler=eth0 */ 3) | e1000_intr_msi() { 3) 2.460 us | __napi_schedule(); 3) 9.416 us | } 3) | /* (irq_handler_exit) irq=28 handler=eth0 return=handled */ 3) + 22.935 us | } Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Acked-by: Masami Hiramatsu Cc: KOSAKI Motohiro Cc: Mathieu Desnoyers Cc: "Frank Ch. Eigler" Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/trace/irq.h | 9 +++++++++ include/trace/irq_event_types.h | 17 +++++++++++++++++ kernel/irq/handle.c | 6 ++++++ kernel/trace/events.c | 2 ++ 4 files changed, 34 insertions(+) create mode 100644 include/trace/irq.h create mode 100644 include/trace/irq_event_types.h diff --git a/include/trace/irq.h b/include/trace/irq.h new file mode 100644 index 000000000000..ff5d4495dc37 --- /dev/null +++ b/include/trace/irq.h @@ -0,0 +1,9 @@ +#ifndef _TRACE_IRQ_H +#define _TRACE_IRQ_H + +#include +#include + +#include + +#endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h new file mode 100644 index 000000000000..5d0919fdd2d4 --- /dev/null +++ b/include/trace/irq_event_types.h @@ -0,0 +1,17 @@ + +/* use instead */ +#ifndef TRACE_FORMAT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +TRACE_FORMAT(irq_handler_entry, + TPPROTO(int irq, struct irqaction *action), + TPARGS(irq, action), + TPFMT("irq=%d handler=%s", irq, action->name)); + +TRACE_FORMAT(irq_handler_exit, + TPPROTO(int irq, struct irqaction *action, int ret), + TPARGS(irq, action, ret), + TPFMT("irq=%d handler=%s return=%s", + irq, action->name, ret ? "handled" : "unhandled")); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3aba8d12f328..4709a7c870d7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internals.h" @@ -316,6 +317,9 @@ irqreturn_t no_action(int cpl, void *dev_id) return IRQ_NONE; } +DEFINE_TRACE(irq_handler_entry); +DEFINE_TRACE(irq_handler_exit); + /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -332,7 +336,9 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) local_irq_enable_in_hardirq(); do { + trace_irq_handler_entry(irq, action); ret = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, ret); if (ret == IRQ_HANDLED) status |= action->flags; retval |= ret; diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 38c89eef99ee..3c75623893cc 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -6,8 +6,10 @@ /* trace/.h here */ #include +#include #include "trace_events.h" /* trace/_event_types.h here */ #include +#include -- cgit v1.2.3-59-g8ed1b From 6409c4da289d6905f7ae2bd0630438368439bda2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:14 +0200 Subject: sched: sched_clock() improvement: use in_nmi() make sure we dont execute more complex sched_clock() code in NMI context. Acked-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index a0b0852414cc..db69174b1178 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -151,6 +152,13 @@ u64 sched_clock_cpu(int cpu) struct sched_clock_data *scd = cpu_sdc(cpu); u64 now, clock, this_clock, remote_clock; + /* + * Normally this is not called in NMI context - but if it is, + * trying to do any locking here is totally lethal. + */ + if (unlikely(in_nmi())) + return scd->clock; + if (unlikely(!sched_clock_running)) return 0ull; -- cgit v1.2.3-59-g8ed1b From 14131f2f98ac350ee9e73faed916d2238a8b6a0d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Feb 2009 18:47:11 +0100 Subject: tracing: implement trace_clock_*() APIs Impact: implement new tracing timestamp APIs Add three trace clock variants, with differing scalability/precision tradeoffs: - local: CPU-local trace clock - medium: scalable global clock with some jitter - global: globally monotonic, serialized clock Make the ring-buffer use the local trace clock internally. Acked-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/trace_clock.h | 19 +++++++++ kernel/trace/Makefile | 1 + kernel/trace/ring_buffer.c | 5 +-- kernel/trace/trace_clock.c | 101 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 include/linux/trace_clock.h create mode 100644 kernel/trace/trace_clock.c diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h new file mode 100644 index 000000000000..7a8130384087 --- /dev/null +++ b/include/linux/trace_clock.h @@ -0,0 +1,19 @@ +#ifndef _LINUX_TRACE_CLOCK_H +#define _LINUX_TRACE_CLOCK_H + +/* + * 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + */ +#include +#include + +extern u64 notrace trace_clock_local(void); +extern u64 notrace trace_clock(void); +extern u64 notrace trace_clock_global(void); + +#endif /* _LINUX_TRACE_CLOCK_H */ diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 664b6c0dc75a..c931fe0560cb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8f19f1aa42b0..a8c275c01e83 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Steven Rostedt */ #include +#include #include #include #include @@ -12,7 +13,6 @@ #include #include #include -#include /* used for sched_clock() (for now) */ #include #include #include @@ -112,14 +112,13 @@ EXPORT_SYMBOL_GPL(tracing_is_on); /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -/* FIXME!!! */ u64 ring_buffer_time_stamp(int cpu) { u64 time; preempt_disable_notrace(); /* shift to debug/test normalization and TIME_EXTENTS */ - time = sched_clock() << DEBUG_SHIFT; + time = trace_clock_local() << DEBUG_SHIFT; preempt_enable_no_resched_notrace(); return time; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c new file mode 100644 index 000000000000..2d4953f93560 --- /dev/null +++ b/kernel/trace/trace_clock.c @@ -0,0 +1,101 @@ +/* + * tracing clocks + * + * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar + * + * Implements 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + * + * Tracer plugins will chose a default from these clocks. + */ +#include +#include +#include +#include +#include +#include + +/* + * trace_clock_local(): the simplest and least coherent tracing clock. + * + * Useful for tracing that does not cross to other CPUs nor + * does it go through idle events. + */ +u64 notrace trace_clock_local(void) +{ + /* + * sched_clock() is an architecture implemented, fast, scalable, + * lockless clock. It is not guaranteed to be coherent across + * CPUs, nor across CPU idle events. + */ + return sched_clock(); +} + +/* + * trace_clock(): 'inbetween' trace clock. Not completely serialized, + * but not completely incorrect when crossing CPUs either. + * + * This is based on cpu_clock(), which will allow at most ~1 jiffy of + * jitter between CPUs. So it's a pretty scalable clock, but there + * can be offsets in the trace data. + */ +u64 notrace trace_clock(void) +{ + return cpu_clock(raw_smp_processor_id()); +} + + +/* + * trace_clock_global(): special globally coherent trace clock + * + * It has higher overhead than the other trace clocks but is still + * an order of magnitude faster than GTOD derived hardware clocks. + * + * Used by plugins that need globally coherent timestamps. + */ + +static u64 prev_trace_clock_time; + +static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +u64 notrace trace_clock_global(void) +{ + unsigned long flags; + int this_cpu; + u64 now; + + raw_local_irq_save(flags); + + this_cpu = raw_smp_processor_id(); + now = cpu_clock(this_cpu); + /* + * If in an NMI context then dont risk lockups and return the + * cpu_clock() time: + */ + if (unlikely(in_nmi())) + goto out; + + __raw_spin_lock(&trace_clock_lock); + + /* + * TODO: if this happens often then maybe we should reset + * my_scd->clock to prev_trace_clock_time+1, to make sure + * we start ticking with the local clock from now on? + */ + if ((s64)(now - prev_trace_clock_time) < 0) + now = prev_trace_clock_time + 1; + + prev_trace_clock_time = now; + + __raw_spin_unlock(&trace_clock_lock); + + out: + raw_local_irq_restore(flags); + + return now; +} -- cgit v1.2.3-59-g8ed1b From ba1d755a36f66101aa88ac9ebb54694def6ec38d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 18 Oct 2008 21:24:45 +0200 Subject: fix warning in arch/x86/kernel/cpu/intel_cacheinfo.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix this warning: arch/x86/kernel/cpu/intel_cacheinfo.c:139: warning: ‘k8_nb_id’ defined but not used arch/x86/kernel/cpu/intel_cacheinfo.c:527: warning: ‘free_cache_attributes’ defined but not used arch/x86/kernel/cpu/intel_cacheinfo.c:538: warning: ‘detect_cache_attributes’ defined but not used Unused variables in the !CONFIG_SYSCTL case. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 51b5dfd67163..03f93c5dcfb3 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -144,7 +144,7 @@ struct _cpuid4_info_regs { unsigned long can_disable; }; -#ifdef CONFIG_PCI +#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) static struct pci_device_id k8_nb_id[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, @@ -484,6 +484,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) return l2; } +#ifdef CONFIG_SYSFS + /* pointer to _cpuid4_info array (for each cache leaf) */ static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); #define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) @@ -597,8 +599,6 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) return retval; } -#ifdef CONFIG_SYSFS - #include #include -- cgit v1.2.3-59-g8ed1b From a8259075074fb09c230b4cd2c8d3ee3c49d6ecd1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Feb 2009 22:19:12 -0500 Subject: tracing: add options directory and core option files This patch creates an options directory in the debugfs, that contains the available tracing options. These files contain 1 or 0, where 1 is the option is enabled and 0 it is disabled. Simply echoing in 1 will enable the option and 0 will disable it. This patch only contains the core options, not the tracer options. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdaf60d3d337..40e983ed994f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3093,6 +3093,121 @@ static void tracing_init_debugfs_percpu(long cpu) #include "trace_selftest.c" #endif +static ssize_t +trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + long index = (long)filp->private_data; + char *buf; + + if (trace_flags & (1 << index)) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + long index = (long)filp->private_data; + char buf[64]; + unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + switch (val) { + case 0: + trace_flags &= ~(1 << index); + break; + case 1: + trace_flags |= 1 << index; + break; + + default: + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + + +static const struct file_operations trace_options_core_fops = { + .open = tracing_open_generic, + .read = trace_options_core_read, + .write = trace_options_core_write, +}; + +static struct dentry *trace_options_init_dentry(void) +{ + struct dentry *d_tracer; + static struct dentry *t_options; + + if (t_options) + return t_options; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return NULL; + + t_options = debugfs_create_dir("options", d_tracer); + if (!t_options) { + pr_warning("Could not create debugfs directory 'options'\n"); + return NULL; + } + + return t_options; +} + +static struct dentry * +create_trace_option_core_file(const char *option, long index) +{ + struct dentry *t_options; + struct dentry *entry; + + t_options = trace_options_init_dentry(); + if (!t_options) + return NULL; + + entry = debugfs_create_file(option, 0644, t_options, (void *)index, + &trace_options_core_fops); + + return entry; +} + +static __init void create_trace_options_dir(void) +{ + struct dentry *t_options; + struct dentry *entry; + int i; + + t_options = trace_options_init_dentry(); + if (!t_options) + return; + + for (i = 0; trace_options[i]; i++) { + entry = create_trace_option_core_file(trace_options[i], i); + if (!entry) + pr_warning("Could not create debugfs %s entry\n", + trace_options[i]); + } +} + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -3111,6 +3226,8 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'trace_options' entry\n"); + create_trace_options_dir(); + entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, NULL, &tracing_cpumask_fops); if (!entry) -- cgit v1.2.3-59-g8ed1b From 577b785f55168d5acb3d123ba41bfe8d7981e044 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Feb 2009 23:43:05 -0500 Subject: tracing: add tracer dependent options to options directory This patch adds the tracer dependent options dynamically to the options directory when the tracer is activated. These options are removed when the tracer is deactivated. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 173 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 40e983ed994f..485c6e7f4465 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2275,8 +2275,17 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } +struct trace_option_dentry; + +static struct trace_option_dentry * +create_trace_option_files(struct tracer *tracer); + +static void +destroy_trace_option_files(struct trace_option_dentry *topts); + static int tracing_set_tracer(const char *buf) { + static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; int ret = 0; @@ -2297,7 +2306,12 @@ static int tracing_set_tracer(const char *buf) if (current_trace && current_trace->reset) current_trace->reset(tr); + destroy_trace_option_files(topts); + current_trace = t; + + topts = create_trace_option_files(current_trace); + if (t->init) { ret = tracer_init(t, tr); if (ret) @@ -3093,6 +3107,95 @@ static void tracing_init_debugfs_percpu(long cpu) #include "trace_selftest.c" #endif +struct trace_option_dentry { + struct tracer_opt *opt; + struct tracer_flags *flags; + struct dentry *entry; +}; + +static ssize_t +trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_option_dentry *topt = filp->private_data; + char *buf; + + if (topt->flags->val & topt->opt->bit) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_option_dentry *topt = filp->private_data; + unsigned long val; + char buf[64]; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + ret = 0; + switch (val) { + case 0: + /* do nothing if already cleared */ + if (!(topt->flags->val & topt->opt->bit)) + break; + + mutex_lock(&trace_types_lock); + if (current_trace->set_flag) + ret = current_trace->set_flag(topt->flags->val, + topt->opt->bit, 0); + mutex_unlock(&trace_types_lock); + if (ret) + return ret; + topt->flags->val &= ~topt->opt->bit; + break; + case 1: + /* do nothing if already set */ + if (topt->flags->val & topt->opt->bit) + break; + + mutex_lock(&trace_types_lock); + if (current_trace->set_flag) + ret = current_trace->set_flag(topt->flags->val, + topt->opt->bit, 1); + mutex_unlock(&trace_types_lock); + if (ret) + return ret; + topt->flags->val |= topt->opt->bit; + break; + + default: + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + + +static const struct file_operations trace_options_fops = { + .open = tracing_open_generic, + .read = trace_options_read, + .write = trace_options_write, +}; + static ssize_t trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3146,7 +3249,6 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } - static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, @@ -3174,6 +3276,76 @@ static struct dentry *trace_options_init_dentry(void) return t_options; } +static void +create_trace_option_file(struct trace_option_dentry *topt, + struct tracer_flags *flags, + struct tracer_opt *opt) +{ + struct dentry *t_options; + struct dentry *entry; + + t_options = trace_options_init_dentry(); + if (!t_options) + return; + + topt->flags = flags; + topt->opt = opt; + + entry = debugfs_create_file(opt->name, 0644, t_options, topt, + &trace_options_fops); + + topt->entry = entry; + +} + +static struct trace_option_dentry * +create_trace_option_files(struct tracer *tracer) +{ + struct trace_option_dentry *topts; + struct tracer_flags *flags; + struct tracer_opt *opts; + int cnt; + + if (!tracer) + return NULL; + + flags = tracer->flags; + + if (!flags || !flags->opts) + return NULL; + + opts = flags->opts; + + for (cnt = 0; opts[cnt].name; cnt++) + ; + + topts = kzalloc(sizeof(*topts) * (cnt + 1), GFP_KERNEL); + if (!topts) + return NULL; + + for (cnt = 0; opts[cnt].name; cnt++) + create_trace_option_file(&topts[cnt], flags, + &opts[cnt]); + + return topts; +} + +static void +destroy_trace_option_files(struct trace_option_dentry *topts) +{ + int cnt; + + if (!topts) + return; + + for (cnt = 0; topts[cnt].opt; cnt++) { + if (topts[cnt].entry) + debugfs_remove(topts[cnt].entry); + } + + kfree(topts); +} + static struct dentry * create_trace_option_core_file(const char *option, long index) { -- cgit v1.2.3-59-g8ed1b From d8e83d26b5ab3b31ee0ff6d093a2627707a1e221 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Feb 2009 23:55:58 -0500 Subject: tracing: add protection around open use of current_tracer Impact: fix to possible race conditions There's some uses of current_tracer that is not protected by the trace_types_lock. There is a small chance that a sysadmin changes the tracer while the current_tracer is being referenced. If the race is hit, it is unlikely to cause any harm since the tracers are constant and are not freed. But some strang side effects may occur. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 485c6e7f4465..6c89ec6cbd48 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2024,12 +2024,12 @@ static ssize_t tracing_trace_options_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int i; + struct tracer_opt *trace_opts; + u32 tracer_flags; + int len = 0; char *buf; int r = 0; - int len = 0; - u32 tracer_flags = current_trace->flags->val; - struct tracer_opt *trace_opts = current_trace->flags->opts; + int i; /* calculate max size */ @@ -2038,6 +2038,10 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, len += 3; /* "no" and space */ } + mutex_lock(&trace_types_lock); + tracer_flags = current_trace->flags->val; + trace_opts = current_trace->flags->opts; + /* * Increase the size with names of options specific * of the current tracer. @@ -2049,8 +2053,10 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, /* +2 for \n and \0 */ buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) + if (!buf) { + mutex_unlock(&trace_types_lock); return -ENOMEM; + } for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) @@ -2067,6 +2073,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, r += sprintf(buf + r, "no%s ", trace_opts[i].name); } + mutex_unlock(&trace_types_lock); r += sprintf(buf + r, "\n"); WARN_ON(r >= len + 2); @@ -2074,7 +2081,6 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); kfree(buf); - return r; } @@ -2149,7 +2155,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, /* If no option could be set, test the specific tracer options */ if (!trace_options[i]) { + mutex_lock(&trace_types_lock); ret = set_tracer_option(current_trace, cmp, neg); + mutex_unlock(&trace_types_lock); if (ret) return ret; } -- cgit v1.2.3-59-g8ed1b From 85a2f9b46f8cd8aaa11c64c715e1ea3ec27ec486 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 00:12:38 -0500 Subject: tracing: use pointer error returns for __tracing_open Impact: fix compile warning and clean up When I first wrote __tracing_open, instead of passing the error code via the ERR_PTR macros, I lazily used a separate parameter to hold the return for errors. When Frederic Weisbecker updated that function, he used the Linux kernel ERR_PTR for the returns. This caused the parameter return to possibly not be initialized on error. gcc correctly pointed this out with a warning. This patch converts the entire function to use the Linux kernel ERR_PTR macro methods. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c89ec6cbd48..304e02ce39c4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1684,23 +1684,20 @@ static struct seq_operations tracer_seq_ops = { }; static struct trace_iterator * -__tracing_open(struct inode *inode, struct file *file, int *ret) +__tracing_open(struct inode *inode, struct file *file) { long cpu_file = (long) inode->i_private; + void *fail_ret = ERR_PTR(-ENOMEM); struct trace_iterator *iter; struct seq_file *m; - int cpu; + int cpu, ret; - if (tracing_disabled) { - *ret = -ENODEV; - return NULL; - } + if (tracing_disabled) + return ERR_PTR(-ENODEV); iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) { - *ret = -ENOMEM; - goto out; - } + if (!iter) + return ERR_PTR(-ENOMEM); /* * We make a copy of the current tracer to avoid concurrent @@ -1708,10 +1705,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) */ mutex_lock(&trace_types_lock); iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); - if (!iter->trace) { - *ret = -ENOMEM; + if (!iter->trace) goto fail; - } + if (current_trace) *iter->trace = *current_trace; @@ -1750,9 +1746,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) } /* TODO stop tracer */ - *ret = seq_open(file, &tracer_seq_ops); - if (*ret) + ret = seq_open(file, &tracer_seq_ops); + if (ret < 0) { + fail_ret = ERR_PTR(ret); goto fail_buffer; + } m = file->private_data; m->private = iter; @@ -1762,7 +1760,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) mutex_unlock(&trace_types_lock); - out: return iter; fail_buffer: @@ -1775,7 +1772,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) kfree(iter->trace); kfree(iter); - return ERR_PTR(-ENOMEM); + return fail_ret; } int tracing_open_generic(struct inode *inode, struct file *filp) @@ -1815,9 +1812,12 @@ static int tracing_release(struct inode *inode, struct file *file) static int tracing_open(struct inode *inode, struct file *file) { - int ret; + struct trace_iterator *iter; + int ret = 0; - __tracing_open(inode, file, &ret); + iter = __tracing_open(inode, file); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); return ret; } @@ -1825,11 +1825,13 @@ static int tracing_open(struct inode *inode, struct file *file) static int tracing_lt_open(struct inode *inode, struct file *file) { struct trace_iterator *iter; - int ret; + int ret = 0; - iter = __tracing_open(inode, file, &ret); + iter = __tracing_open(inode, file); - if (!ret) + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + else iter->iter_flags |= TRACE_FILE_LAT_FMT; return ret; -- cgit v1.2.3-59-g8ed1b From 5c6a3ae1b4beebb56e2916b84f1208d96a9e32ff Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 00:22:21 -0500 Subject: tracing: use newline separator for trace options list Impact: clean up Instead of listing the trace options like: # cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin noblock nostacktrace nosched-tree ftrace_printk noftrace_preempt nobranch annotate nouserstacktrace nosym-userobj We now list them like: # cat /debug/tracing/trace_options print-parent nosym-offset nosym-addr noverbose noraw nohex nobin noblock nostacktrace nosched-tree ftrace_printk noftrace_preempt nobranch annotate nouserstacktrace nosym-userobj Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 304e02ce39c4..5db7485158df 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2037,7 +2037,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, /* calculate max size */ for (i = 0; trace_options[i]; i++) { len += strlen(trace_options[i]); - len += 3; /* "no" and space */ + len += 3; /* "no" and newline */ } mutex_lock(&trace_types_lock); @@ -2050,7 +2050,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, */ for (i = 0; trace_opts[i].name; i++) { len += strlen(trace_opts[i].name); - len += 3; /* "no" and space */ + len += 3; /* "no" and newline */ } /* +2 for \n and \0 */ @@ -2062,22 +2062,21 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf, for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) - r += sprintf(buf + r, "%s ", trace_options[i]); + r += sprintf(buf + r, "%s\n", trace_options[i]); else - r += sprintf(buf + r, "no%s ", trace_options[i]); + r += sprintf(buf + r, "no%s\n", trace_options[i]); } for (i = 0; trace_opts[i].name; i++) { if (tracer_flags & trace_opts[i].bit) - r += sprintf(buf + r, "%s ", + r += sprintf(buf + r, "%s\n", trace_opts[i].name); else - r += sprintf(buf + r, "no%s ", + r += sprintf(buf + r, "no%s\n", trace_opts[i].name); } mutex_unlock(&trace_types_lock); - r += sprintf(buf + r, "\n"); WARN_ON(r >= len + 2); r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -- cgit v1.2.3-59-g8ed1b From 0cfe82451dfa3ebf4e69158f2eb450f2fbb6b715 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 10:51:10 -0500 Subject: tracing: replace kzalloc with kcalloc Impact: clean up kcalloc is a better approach to allocate a NULL array. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5db7485158df..9c5987aca74b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3328,7 +3328,7 @@ create_trace_option_files(struct tracer *tracer) for (cnt = 0; opts[cnt].name; cnt++) ; - topts = kzalloc(sizeof(*topts) * (cnt + 1), GFP_KERNEL); + topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL); if (!topts) return NULL; -- cgit v1.2.3-59-g8ed1b From eb594e45f6979cd10b18d87f7b3f02119e00a108 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 17:36:06 -0500 Subject: tracing: move trace point formats to files in include/trace directory Impact: clean up To further facilitate the ease of adding trace points for developers, this patch creates include/trace/trace_events.h and include/trace/trace_event_types.h. The former file will hold the trace/.h files and the latter will hold the trace/_event_types.h files. To create new tracepoints and to have them automatically appear in the event tracer, a developer makes the trace/.h file which includes and the trace/_event_types.h file. The trace/_event_types.h file will hold the TRACE_FORMAT macros. Then add the trace/.h file to trace/trace_events.h, and add the trace/_event_types.h to the trace_event_types.h file. No need to modify files elsewhere. Signed-off-by: Steven Rostedt --- include/trace/trace_event_types.h | 4 ++++ include/trace/trace_events.h | 4 ++++ kernel/trace/events.c | 10 ++-------- 3 files changed, 10 insertions(+), 8 deletions(-) create mode 100644 include/trace/trace_event_types.h create mode 100644 include/trace/trace_events.h diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h new file mode 100644 index 000000000000..33c8ed5ccb6c --- /dev/null +++ b/include/trace/trace_event_types.h @@ -0,0 +1,4 @@ +/* trace/_event_types.h here */ + +#include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h new file mode 100644 index 000000000000..ea2ef2051762 --- /dev/null +++ b/include/trace/trace_events.h @@ -0,0 +1,4 @@ +/* trace/.h here */ + +#include +#include diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 3c75623893cc..46e27ad2487e 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -1,15 +1,9 @@ /* * This is the place to register all trace points as events. - * Include the trace/.h at the top. - * Include the trace/_event_types.h at the bottom. */ -/* trace/.h here */ -#include -#include +#include #include "trace_events.h" -/* trace/_event_types.h here */ -#include -#include +#include -- cgit v1.2.3-59-g8ed1b From 6ecc2d1ca39177edb6fbdb7412948b0e9f409d02 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 21:33:02 -0500 Subject: tracing: add subsystem level to trace events If a trace point header defines TRACE_SYSTEM, then it will add the following trace points into that event system. If include/trace/irq_event_types.h has: #define TRACE_SYSTEM irq at the top and #undef TRACE_SYSTEM at the bottom, then a directory "irq" will be created in the /debug/tracing/events directory. Inside that directory will contain the two trace points that are defined in include/trace/irq_event_types.h. Only adding the above to irq and not to sched, we get: # ls /debug/tracing/events/ irq sched_process_exit sched_signal_send sched_wakeup_new sched_kthread_stop sched_process_fork sched_switch sched_kthread_stop_ret sched_process_free sched_wait_task sched_migrate_task sched_process_wait sched_wakeup # ls /debug/tracing/events/irq irq_handler_entry irq_handler_exit If we add #define TRACE_SYSTEM sched to the trace/sched_event_types.h then the rest of the trace events will be put in a sched directory within the events directory. I've been playing with this idea of the subsystem for a while, but recently Tom Zanussi posted some patches to lkml that included this method. Tom's approach was clean and got me to finally put some effort to clean up the event trace points. Thanks to Tom Zanussi for demonstrating how nice the subsystem method is. Signed-off-by: Steven Rostedt --- kernel/trace/events.c | 4 ++++ kernel/trace/trace_events.c | 48 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.h | 2 ++ 3 files changed, 54 insertions(+) diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 46e27ad2487e..4e4e45860c58 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -2,6 +2,10 @@ * This is the place to register all trace points as events. */ +/* someday this needs to go in a generic header */ +#define __STR(x) #x +#define STR(x) __STR(x) + #include #include "trace_events.h" diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3bcb9df93342..19332200c457 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -345,11 +345,59 @@ static struct dentry *event_trace_events_dir(void) return d_events; } +struct event_subsystem { + struct list_head list; + const char *name; + struct dentry *entry; +}; + +static LIST_HEAD(event_subsystems); + +static struct dentry * +event_subsystem_dir(const char *name, struct dentry *d_events) +{ + struct event_subsystem *system; + + /* First see if we did not already create this dir */ + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) + return system->entry; + } + + /* need to create new entry */ + system = kmalloc(sizeof(*system), GFP_KERNEL); + if (!system) { + pr_warning("No memory to create event subsystem %s\n", + name); + return d_events; + } + + system->entry = debugfs_create_dir(name, d_events); + if (!system->entry) { + pr_warning("Could not create event subsystem %s\n", + name); + kfree(system); + return d_events; + } + + system->name = name; + list_add(&system->list, &event_subsystems); + + return system->entry; +} + static int event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) { struct dentry *entry; + /* + * If the trace point header did not define TRACE_SYSTEM + * then the system would be called "TRACE_SYSTEM". + */ + if (strcmp(call->system, "TRACE_SYSTEM") != 0) + d_events = event_subsystem_dir(call->system, d_events); + call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h index deb95e5006c8..b015d7b19878 100644 --- a/kernel/trace/trace_events.h +++ b/kernel/trace/trace_events.h @@ -7,6 +7,7 @@ struct ftrace_event_call { char *name; + char *system; struct dentry *dir; int enabled; int (*regfunc)(void); @@ -44,6 +45,7 @@ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ + .system = STR(TRACE_SYSTEM), \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ } -- cgit v1.2.3-59-g8ed1b From b628b3e629b1436710e59a21cc020fbb04a52ce1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 23:32:58 -0500 Subject: tracing: make the set_event and available_events subsystem aware This patch makes the event files, set_event and available_events aware of the subsystem. Now you can enable an entire subsystem with: echo 'irq:*' > set_event Note: the '*' is not needed. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 19332200c457..b811eb343522 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -12,6 +12,8 @@ #include "trace_events.h" +#define TRACE_SYSTEM "TRACE_SYSTEM" + #define events_for_each(event) \ for (event = __start_ftrace_events; \ (unsigned long)event < (unsigned long)__stop_ftrace_events; \ @@ -45,14 +47,47 @@ static void ftrace_clear_events(void) static int ftrace_set_clr_event(char *buf, int set) { struct ftrace_event_call *call = __start_ftrace_events; + char *event = NULL, *sub = NULL, *match; + int ret = -EINVAL; + + /* + * The buf format can be : + * *: means any event by that name. + * : is the same. + * + * :* means all events in that subsystem + * : means the same. + * + * (no ':') means all events in a subsystem with + * the name or any event that matches + */ + + match = strsep(&buf, ":"); + if (buf) { + sub = match; + event = buf; + match = NULL; + if (!strlen(sub) || strcmp(sub, "*") == 0) + sub = NULL; + if (!strlen(event) || strcmp(event, "*") == 0) + event = NULL; + } events_for_each(call) { if (!call->name) continue; - if (strcmp(buf, call->name) != 0) + if (match && + strcmp(match, call->name) != 0 && + strcmp(match, call->system) != 0) + continue; + + if (sub && strcmp(sub, call->system) != 0) + continue; + + if (event && strcmp(event, call->name) != 0) continue; if (set) { @@ -68,9 +103,9 @@ static int ftrace_set_clr_event(char *buf, int set) call->enabled = 0; call->unregfunc(); } - return 0; + ret = 0; } - return -EINVAL; + return ret; } /* 128 should be much more than enough */ @@ -200,6 +235,8 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_event_call *call = v; + if (strcmp(call->system, TRACE_SYSTEM) != 0) + seq_printf(m, "%s:", call->system); seq_printf(m, "%s\n", call->name); return 0; -- cgit v1.2.3-59-g8ed1b From 0ec2ef1505b3e1f54b07bf64f184c92859c3e13f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 23:41:43 -0500 Subject: tracing: add subsystem irq for irq events Add the TRACE_SYSTEM irq for the irq events. Signed-off-by: Steven Rostedt --- include/trace/irq_event_types.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 5d0919fdd2d4..47a2be1b2d7d 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -5,6 +5,9 @@ # error Unless you know what you are doing. #endif +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq + TRACE_FORMAT(irq_handler_entry, TPPROTO(int irq, struct irqaction *action), TPARGS(irq, action), @@ -15,3 +18,5 @@ TRACE_FORMAT(irq_handler_exit, TPARGS(irq, action, ret), TPFMT("irq=%d handler=%s return=%s", irq, action->name, ret ? "handled" : "unhandled")); + +#undef TRACE_SYSTEM -- cgit v1.2.3-59-g8ed1b From 3d7ba938da8481b4f7f9ed3d943dbae49389b284 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 23:45:52 -0500 Subject: tracing: add subsystem sched for sched events Add the TRACE_SYSTEM sched for the sched events. Signed-off-by: Steven Rostedt --- include/trace/sched_event_types.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index a3d3d66a51c8..2ada206565a3 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -5,6 +5,9 @@ # error Unless you know what you are doing. #endif +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched + TRACE_FORMAT(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t), @@ -70,3 +73,5 @@ TRACE_FORMAT(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p), TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid)); + +#undef TRACE_SYSTEM -- cgit v1.2.3-59-g8ed1b From ef5580d0fffce6e0a01043bac0625128b5d409a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 19:38:04 -0500 Subject: tracing: add interface to write into current tracer buffer Right now all tracers must manage their own trace buffers. This was to enforce tracers to be independent in case we finally decide to allow each tracer to have their own trace buffer. But now we are adding event tracing that writes to the current tracer's buffer. This adds an interface to allow events to write to the current tracer buffer without having to manage its own. Since event tracing has no "tracer", and is just a way to hook into any other tracer. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 14 ++++++++++++++ kernel/trace/trace.h | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9c5987aca74b..c5e39cd7310d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -846,6 +846,20 @@ void trace_buffer_unlock_commit(struct trace_array *tr, trace_wake_up(); } +struct ring_buffer_event * +trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, + unsigned long flags, int pc) +{ + return trace_buffer_lock_reserve(&global_trace, + type, len, flags, pc); +} + +void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + return trace_buffer_unlock_commit(&global_trace, event, flags, pc); +} + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 632191770aac..adf161f6dd11 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -442,6 +442,12 @@ void trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, unsigned long flags, int pc); +struct ring_buffer_event * +trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, + unsigned long flags, int pc); +void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); + struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); -- cgit v1.2.3-59-g8ed1b From c32e827b25054cb17b79cf97fb5e63ae4ce2223c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 Feb 2009 19:12:30 -0500 Subject: tracing: add raw trace point recording infrastructure Impact: lower overhead tracing The current event tracer can automatically pick up trace points that are registered with the TRACE_FORMAT macro. But it required a printf format string and parsing. Although, this adds the ability to get guaranteed information like task names and such, it took a hit in overhead processing. This processing can add about 500-1000 nanoseconds overhead, but in some cases that too is considered too much and we want to shave off as much from this overhead as possible. Tom Zanussi recently posted tracing patches to lkml that are based on a nice idea about capturing the data via C structs using STRUCT_ENTER, STRUCT_EXIT type of macros. I liked that method very much, but did not like the implementation that required a developer to add data/code in several disjoint locations. This patch extends the event_tracer macros to do a similar "raw C" approach that Tom Zanussi did. But instead of having the developers needing to tweak a bunch of code all over the place, they can do it all in one macro - preferably placed near the code that it is tracing. That makes it much more likely that tracepoints will be maintained on an ongoing basis by the code they modify. The new macro TRACE_EVENT_FORMAT is created for this approach. (Note, a developer may still utilize the more low level DECLARE_TRACE macros if they don't care about getting their traces automatically in the event tracer.) They can also use the existing TRACE_FORMAT if they don't need to code the tracepoint in C, but just want to use the convenience of printf. So if the developer wants to "hardwire" a tracepoint in the fastest possible way, and wants to acquire their data via a user space utility in a raw binary format, or wants to see it in the trace output but not sacrifice any performance, then they can implement the faster but more complex TRACE_EVENT_FORMAT macro. Here's what usage looks like: TRACE_EVENT_FORMAT(name, TPPROTO(proto), TPARGS(args), TPFMT(fmt, fmt_args), TRACE_STUCT( TRACE_FIELD(type1, item1, assign1) TRACE_FIELD(type2, item2, assign2) [...] ), TPRAWFMT(raw_fmt) ); Note name, proto, args, and fmt, are all identical to what TRACE_FORMAT uses. name: is the unique identifier of the trace point proto: The proto type that the trace point uses args: the args in the proto type fmt: printf format to use with the event printf tracer fmt_args: the printf argments to match fmt TRACE_STRUCT starts the ability to create a structure. Each item in the structure is defined with a TRACE_FIELD TRACE_FIELD(type, item, assign) type: the C type of item. item: the name of the item in the stucture assign: what to assign the item in the trace point callback raw_fmt is a way to pretty print the struct. It must match the order of the items are added in TRACE_STUCT An example of this would be: TRACE_EVENT_FORMAT(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d %s", p->comm, p->pid, success?"succeeded":"failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), TPRAWFMT("task %d success=%d") ); This creates us a unique struct of: struct { pid_t pid; int success; }; And the way the call back would assign these values would be: entry->pid = p->pid; entry->success = success; The nice part about this is that the creation of the assignent is done via macro magic in the event tracer. Once the TRACE_EVENT_FORMAT is created, the developer will then have a faster method to record into the ring buffer. They do not need to worry about the tracer itself. The developer would only need to touch the files in include/trace/*.h Again, I would like to give special thanks to Tom Zanussi for this nice idea. Idea-from: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/events.c | 6 +- kernel/trace/trace.h | 19 ++++ kernel/trace/trace_events.c | 2 +- kernel/trace/trace_events.h | 57 ---------- kernel/trace/trace_events_stage_1.h | 34 ++++++ kernel/trace/trace_events_stage_2.h | 72 ++++++++++++ kernel/trace/trace_events_stage_3.h | 219 ++++++++++++++++++++++++++++++++++++ 7 files changed, 350 insertions(+), 59 deletions(-) delete mode 100644 kernel/trace/trace_events.h create mode 100644 kernel/trace/trace_events_stage_1.h create mode 100644 kernel/trace/trace_events_stage_2.h create mode 100644 kernel/trace/trace_events_stage_3.h diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 4e4e45860c58..f2509cbaacea 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -8,6 +8,10 @@ #include -#include "trace_events.h" +#include "trace_output.h" + +#include "trace_events_stage_1.h" +#include "trace_events_stage_2.h" +#include "trace_events_stage_3.h" #include diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index adf161f6dd11..aa1ab0cb80ab 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -726,4 +726,23 @@ static inline void trace_branch_disable(void) } #endif /* CONFIG_BRANCH_TRACER */ +struct ftrace_event_call { + char *name; + char *system; + struct dentry *dir; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); + int id; + struct dentry *raw_dir; + int raw_enabled; + int (*raw_init)(void); + int (*raw_reg)(void); + void (*raw_unreg)(void); +}; + +void event_trace_printk(unsigned long ip, const char *fmt, ...); +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b811eb343522..77a5c02bd634 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -10,7 +10,7 @@ #include #include -#include "trace_events.h" +#include "trace.h" #define TRACE_SYSTEM "TRACE_SYSTEM" diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h deleted file mode 100644 index b015d7b19878..000000000000 --- a/kernel/trace/trace_events.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef _LINUX_KERNEL_TRACE_EVENTS_H -#define _LINUX_KERNEL_TRACE_EVENTS_H - -#include -#include -#include "trace.h" - -struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); -}; - - -#undef TPFMT -#define TPFMT(fmt, args...) fmt "\n", ##args - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) \ -static void ftrace_event_##call(proto) \ -{ \ - event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ -} \ - \ -static int ftrace_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_event_##call); \ - if (!ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call); \ - return ret; \ -} \ - \ -static void ftrace_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_event_##call); \ -} \ - \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = STR(TRACE_SYSTEM), \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ -} - -void event_trace_printk(unsigned long ip, const char *fmt, ...); -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; - -#endif /* _LINUX_KERNEL_TRACE_EVENTS_H */ diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h new file mode 100644 index 000000000000..fd3bf9382d37 --- /dev/null +++ b/kernel/trace/trace_events_stage_1.h @@ -0,0 +1,34 @@ +/* + * Stage 1 of the trace events. + * + * Override the macros in to include the following: + * + * struct ftrace_raw_ { + * struct trace_entry ent; + * ; + * [...] + * }; + * + * The is created by the TRACE_FIELD(type, item, assign) + * macro. We simply do "type item;", and that will create the fields + * in the structure. + */ + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) \ + struct ftrace_raw_##name { \ + struct trace_entry ent; \ + tstruct \ + }; \ + static struct ftrace_event_call event_##name + +#undef TRACE_STRUCT +#define TRACE_STRUCT(args...) args + +#define TRACE_FIELD(type, item, assign) \ + type item; + +#include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h new file mode 100644 index 000000000000..3eaaef5f19e1 --- /dev/null +++ b/kernel/trace/trace_events_stage_2.h @@ -0,0 +1,72 @@ +/* + * Stage 2 of the trace events. + * + * Override the macros in to include the following: + * + * enum print_line_t + * ftrace_raw_output_(struct trace_iterator *iter, int flags) + * { + * struct trace_seq *s = &iter->seq; + * struct ftrace_raw_ *field; <-- defined in stage 1 + * struct trace_entry *entry; + * int ret; + * + * entry = iter->ent; + * + * if (entry->type != event_.id) { + * WARN_ON_ONCE(1); + * return TRACE_TYPE_UNHANDLED; + * } + * + * field = (typeof(field))entry; + * + * ret = trace_seq_printf(s, "%s", "\n"); + * if (!ret) + * return TRACE_TYPE_PARTIAL_LINE; + * + * return TRACE_TYPE_HANDLED; + * } + * + * This is the method used to print the raw event to the trace + * output format. Note, this is not needed if the data is read + * in binary. + */ + +#undef TRACE_STRUCT +#define TRACE_STRUCT(args...) args + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + field->item, + + +#undef TPRAWFMT +#define TPRAWFMT(args...) args + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +enum print_line_t \ +ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ +{ \ + struct trace_seq *s = &iter->seq; \ + struct ftrace_raw_##call *field; \ + struct trace_entry *entry; \ + int ret; \ + \ + entry = iter->ent; \ + \ + if (entry->type != event_##call.id) { \ + WARN_ON_ONCE(1); \ + return TRACE_TYPE_UNHANDLED; \ + } \ + \ + field = (typeof(field))entry; \ + \ + ret = trace_seq_printf(s, tpfmt "%s", tstruct "\n"); \ + if (!ret) \ + return TRACE_TYPE_PARTIAL_LINE; \ + \ + return TRACE_TYPE_HANDLED; \ +} + +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h new file mode 100644 index 000000000000..7a161c49deb4 --- /dev/null +++ b/kernel/trace/trace_events_stage_3.h @@ -0,0 +1,219 @@ +/* + * Stage 3 of the trace events. + * + * Override the macros in to include the following: + * + * static void ftrace_event_(proto) + * { + * event_trace_printk(_RET_IP_, "() " ); + * } + * + * static int ftrace_reg_event_(void) + * { + * int ret; + * + * ret = register_trace_(ftrace_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; + * } + * + * static void ftrace_unreg_event_(void) + * { + * unregister_trace_(ftrace_event_); + * } + * + * For those macros defined with TRACE_FORMAT: + * + * static struct ftrace_event_call __used + * __attribute__((__aligned__(4))) + * __attribute__((section("_ftrace_events"))) event_ = { + * .name = "", + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, + * } + * + * + * For those macros defined with TRACE_EVENT_FORMAT: + * + * static struct ftrace_event_call event_; + * + * static void ftrace_raw_event_(proto) + * { + * struct ring_buffer_event *event; + * struct ftrace_raw_ *entry; <-- defined in stage 1 + * unsigned long irq_flags; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * event = trace_current_buffer_lock_reserve(event_.id, + * sizeof(struct ftrace_raw_), + * irq_flags, pc); + * if (!event) + * return; + * entry = ring_buffer_event_data(event); + * + * ; <-- Here we assign the entries by the TRACE_FIELD. + * + * trace_current_buffer_unlock_commit(event, irq_flags, pc); + * } + * + * static int ftrace_raw_reg_event_(void) + * { + * int ret; + * + * ret = register_trace_(ftrace_raw_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; + * } + * + * static void ftrace_unreg_event_(void) + * { + * unregister_trace_(ftrace_raw_event_); + * } + * + * static struct trace_event ftrace_event_type_ = { + * .trace = ftrace_raw_output_, <-- stage 2 + * }; + * + * static int ftrace_raw_init_event_(void) + * { + * int id; + * + * id = register_ftrace_event(&ftrace_event_type_); + * if (!id) + * return -ENODEV; + * event_.id = id; + * return 0; + * } + * + * static struct ftrace_event_call __used + * __attribute__((__aligned__(4))) + * __attribute__((section("_ftrace_events"))) event_ = { + * .name = "", + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, + * .raw_init = ftrace_raw_init_event_, + * .raw_reg = ftrace_raw_reg_event_, + * .raw_unreg = ftrace_raw_unreg_event_, + * } + * + */ + +#undef TPFMT +#define TPFMT(fmt, args...) fmt "\n", ##args + +#define _TRACE_FORMAT(call, proto, args, fmt) \ +static void ftrace_event_##call(proto) \ +{ \ + event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ +} \ + \ +static int ftrace_reg_event_##call(void) \ +{ \ + int ret; \ + \ + ret = register_trace_##call(ftrace_event_##call); \ + if (!ret) \ + pr_info("event trace: Could not activate trace point " \ + "probe to " #call); \ + return ret; \ +} \ + \ +static void ftrace_unreg_event_##call(void) \ +{ \ + unregister_trace_##call(ftrace_event_##call); \ +} \ + + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) \ +_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ +static struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .system = STR(TRACE_SYSTEM), \ + .regfunc = ftrace_reg_event_##call, \ + .unregfunc = ftrace_unreg_event_##call, \ +} + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign)\ + entry->item = assign; + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ + \ +static struct ftrace_event_call event_##call; \ + \ +static void ftrace_raw_event_##call(proto) \ +{ \ + struct ring_buffer_event *event; \ + struct ftrace_raw_##call *entry; \ + unsigned long irq_flags; \ + int pc; \ + \ + local_save_flags(irq_flags); \ + pc = preempt_count(); \ + \ + event = trace_current_buffer_lock_reserve(event_##call.id, \ + sizeof(struct ftrace_raw_##call), \ + irq_flags, pc); \ + if (!event) \ + return; \ + entry = ring_buffer_event_data(event); \ + \ + tstruct; \ + \ + trace_current_buffer_unlock_commit(event, irq_flags, pc); \ +} \ + \ +static int ftrace_raw_reg_event_##call(void) \ +{ \ + int ret; \ + \ + ret = register_trace_##call(ftrace_raw_event_##call); \ + if (!ret) \ + pr_info("event trace: Could not activate trace point " \ + "probe to " #call); \ + return ret; \ +} \ + \ +static void ftrace_raw_unreg_event_##call(void) \ +{ \ + unregister_trace_##call(ftrace_raw_event_##call); \ +} \ + \ +static struct trace_event ftrace_event_type_##call = { \ + .trace = ftrace_raw_output_##call, \ +}; \ + \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + int id; \ + \ + id = register_ftrace_event(&ftrace_event_type_##call); \ + if (!id) \ + return -ENODEV; \ + event_##call.id = id; \ + return 0; \ +} \ + \ +static struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .system = STR(TRACE_SYSTEM), \ + .regfunc = ftrace_reg_event_##call, \ + .unregfunc = ftrace_unreg_event_##call, \ + .raw_init = ftrace_raw_init_event_##call, \ + .raw_reg = ftrace_raw_reg_event_##call, \ + .raw_unreg = ftrace_raw_unreg_event_##call, \ +} -- cgit v1.2.3-59-g8ed1b From fd99498989f3b3feeab89dcadf537138ba136d24 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 28 Feb 2009 02:41:25 -0500 Subject: tracing: add raw fast tracing interface for trace events This patch adds the interface to enable the C style trace points. In the directory /debugfs/tracing/events/subsystem/event We now have three files: enable : values 0 or 1 to enable or disable the trace event. available_types: values 'raw' and 'printf' which indicate the tracing types available for the trace point. If a developer does not use the TRACE_EVENT_FORMAT macro and just uses the TRACE_FORMAT macro, then only 'printf' will be available. This file is read only. type: values 'raw' or 'printf'. This indicates which type of tracing is active for that trace point. 'printf' is the default and if 'raw' is not available, this file is read only. # echo raw > /debug/tracing/events/sched/sched_wakeup/type # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable Will enable the C style tracing for the sched_wakeup trace point. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 7 ++ kernel/trace/trace_events.c | 199 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 181 insertions(+), 25 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index aa1ab0cb80ab..f6fa0b9f83a8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -726,6 +726,12 @@ static inline void trace_branch_disable(void) } #endif /* CONFIG_BRANCH_TRACER */ +/* trace event type bit fields, not numeric */ +enum { + TRACE_EVENT_TYPE_PRINTF = 1, + TRACE_EVENT_TYPE_RAW = 2, +}; + struct ftrace_event_call { char *name; char *system; @@ -736,6 +742,7 @@ struct ftrace_event_call { int id; struct dentry *raw_dir; int raw_enabled; + int type; int (*raw_init)(void); int (*raw_reg)(void); void (*raw_unreg)(void); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 77a5c02bd634..1d07f800a9ce 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -44,6 +44,36 @@ static void ftrace_clear_events(void) } } +static void ftrace_event_enable_disable(struct ftrace_event_call *call, + int enable) +{ + + switch (enable) { + case 0: + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + if (call->raw_enabled) { + call->raw_enabled = 0; + call->raw_unreg(); + } + break; + case 1: + if (!call->enabled && + (call->type & TRACE_EVENT_TYPE_PRINTF)) { + call->enabled = 1; + call->regfunc(); + } + if (!call->raw_enabled && + (call->type & TRACE_EVENT_TYPE_RAW)) { + call->raw_enabled = 1; + call->raw_reg(); + } + break; + } +} + static int ftrace_set_clr_event(char *buf, int set) { struct ftrace_event_call *call = __start_ftrace_events; @@ -90,19 +120,8 @@ static int ftrace_set_clr_event(char *buf, int set) if (event && strcmp(event, call->name) != 0) continue; - if (set) { - /* Already set? */ - if (call->enabled) - return 0; - call->enabled = 1; - call->regfunc(); - } else { - /* Already cleared? */ - if (!call->enabled) - return 0; - call->enabled = 0; - call->unregfunc(); - } + ftrace_event_enable_disable(call, set); + ret = 0; } return ret; @@ -273,7 +292,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call = filp->private_data; char *buf; - if (call->enabled) + if (call->enabled || call->raw_enabled) buf = "1\n"; else buf = "0\n"; @@ -304,18 +323,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, switch (val) { case 0: - if (!call->enabled) - break; - - call->enabled = 0; - call->unregfunc(); - break; case 1: - if (call->enabled) - break; - - call->enabled = 1; - call->regfunc(); + ftrace_event_enable_disable(call, val); break; default: @@ -327,6 +336,107 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +event_type_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[16]; + int r = 0; + + if (call->type & TRACE_EVENT_TYPE_PRINTF) + r += sprintf(buf, "printf\n"); + + if (call->type & TRACE_EVENT_TYPE_RAW) + r += sprintf(buf+r, "raw\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +event_type_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[64]; + + /* + * If there's only one type, we can't change it. + * And currently we always have printf type, and we + * may or may not have raw type. + * + * This is a redundant check, the file should be read + * only if this is the case anyway. + */ + + if (!call->raw_init) + return -EPERM; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (!strncmp(buf, "printf", 6) && + (!buf[6] || isspace(buf[6]))) { + + call->type = TRACE_EVENT_TYPE_PRINTF; + + /* + * If raw enabled, the disable it and enable + * printf type. + */ + if (call->raw_enabled) { + call->raw_enabled = 0; + call->raw_unreg(); + + call->enabled = 1; + call->regfunc(); + } + + } else if (!strncmp(buf, "raw", 3) && + (!buf[3] || isspace(buf[3]))) { + + call->type = TRACE_EVENT_TYPE_RAW; + + /* + * If printf enabled, the disable it and enable + * raw type. + */ + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + + call->raw_enabled = 1; + call->raw_reg(); + } + } else + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[16]; + int r = 0; + + r += sprintf(buf, "printf\n"); + + if (call->raw_init) + r += sprintf(buf+r, "raw\n"); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -362,6 +472,17 @@ static const struct file_operations ftrace_enable_fops = { .write = event_enable_write, }; +static const struct file_operations ftrace_type_fops = { + .open = tracing_open_generic, + .read = event_type_read, + .write = event_type_write, +}; + +static const struct file_operations ftrace_available_types_fops = { + .open = tracing_open_generic, + .read = event_available_types_read, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -427,6 +548,7 @@ static int event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) { struct dentry *entry; + int ret; /* * If the trace point header did not define TRACE_SYSTEM @@ -435,6 +557,18 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) if (strcmp(call->system, "TRACE_SYSTEM") != 0) d_events = event_subsystem_dir(call->system, d_events); + if (call->raw_init) { + ret = call->raw_init(); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; + } + } + + /* default the output to printf */ + call->type = TRACE_EVENT_TYPE_PRINTF; + call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " @@ -448,6 +582,21 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) pr_warning("Could not create debugfs " "'%s/enable' entry\n", call->name); + /* Only let type be writable, if we can change it */ + entry = debugfs_create_file("type", + call->raw_init ? 0644 : 0444, + call->dir, call, + &ftrace_type_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/type' entry\n", call->name); + + entry = debugfs_create_file("available_types", 0444, call->dir, call, + &ftrace_available_types_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/type' available_types\n", call->name); + return 0; } -- cgit v1.2.3-59-g8ed1b From 629928041c53771f9902753d50fef6b35f36d33d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 28 Feb 2009 02:47:59 -0500 Subject: tracing: create the C style tracing for the sched subsystem This patch utilizes the TRACE_EVENT_FORMAT macro to enable the C style faster tracing for the sched subsystem trace points. Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 + include/trace/sched_event_types.h | 119 ++++++++++++++++++++++++++++++-------- 2 files changed, 97 insertions(+), 25 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 62d13391a240..152b2f03fb86 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,4 +157,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ + TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) + #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 2ada206565a3..ba059c10b58a 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,6 +1,6 @@ /* use instead */ -#ifndef TRACE_FORMAT +#ifndef TRACE_EVENT_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. #endif @@ -8,70 +8,139 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM sched -TRACE_FORMAT(sched_kthread_stop, +TRACE_EVENT_FORMAT(sched_kthread_stop, TPPROTO(struct task_struct *t), TPARGS(t), - TPFMT("task %s:%d", t->comm, t->pid)); + TPFMT("task %s:%d", t->comm, t->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, t->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_kthread_stop_ret, +TRACE_EVENT_FORMAT(sched_kthread_stop_ret, TPPROTO(int ret), TPARGS(ret), - TPFMT("ret=%d", ret)); + TPFMT("ret=%d", ret), + TRACE_STRUCT( + TRACE_FIELD(int, ret, ret) + ), + TPRAWFMT("ret=%d") + ); -TRACE_FORMAT(sched_wait_task, +TRACE_EVENT_FORMAT(sched_wait_task, TPPROTO(struct rq *rq, struct task_struct *p), TPARGS(rq, p), - TPFMT("task %s:%d", p->comm, p->pid)); + TPFMT("task %s:%d", p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_wakeup, +TRACE_EVENT_FORMAT(sched_wakeup, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d %s", - p->comm, p->pid, success?"succeeded":"failed")); + p->comm, p->pid, success ? "succeeded" : "failed"), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + TRACE_FIELD(int, success, success) + ), + TPRAWFMT("task %d success=%d") + ); -TRACE_FORMAT(sched_wakeup_new, +TRACE_EVENT_FORMAT(sched_wakeup_new, TPPROTO(struct rq *rq, struct task_struct *p, int success), TPARGS(rq, p, success), TPFMT("task %s:%d", - p->comm, p->pid, success?"succeeded":"failed")); + p->comm, p->pid, success ? "succeeded" : "failed"), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + TRACE_FIELD(int, success, success) + ), + TPRAWFMT("task %d success=%d") + ); -TRACE_FORMAT(sched_switch, +TRACE_EVENT_FORMAT(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next), TPFMT("task %s:%d ==> %s:%d", - prev->comm, prev->pid, next->comm, next->pid)); + prev->comm, prev->pid, next->comm, next->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, prev_pid, prev->pid) + TRACE_FIELD(int, prev_prio, prev->prio) + TRACE_FIELD(pid_t, next_pid, next->pid) + TRACE_FIELD(int, next_prio, next->prio) + ), + TPRAWFMT("prev %d:%d ==> next %d:%d") + ); -TRACE_FORMAT(sched_migrate_task, +TRACE_EVENT_FORMAT(sched_migrate_task, TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), TPARGS(p, orig_cpu, dest_cpu), TPFMT("task %s:%d from: %d to: %d", - p->comm, p->pid, orig_cpu, dest_cpu)); + p->comm, p->pid, orig_cpu, dest_cpu), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + TRACE_FIELD(int, orig_cpu, orig_cpu) + TRACE_FIELD(int, dest_cpu, dest_cpu) + ), + TPRAWFMT("task %d from: %d to: %d") + ); -TRACE_FORMAT(sched_process_free, +TRACE_EVENT_FORMAT(sched_process_free, TPPROTO(struct task_struct *p), TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid)); + TPFMT("task %s:%d", p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_process_exit, +TRACE_EVENT_FORMAT(sched_process_exit, TPPROTO(struct task_struct *p), TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid)); + TPFMT("task %s:%d", p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_process_wait, +TRACE_EVENT_FORMAT(sched_process_wait, TPPROTO(struct pid *pid), TPARGS(pid), - TPFMT("pid %d", pid)); + TPFMT("pid %d", pid_nr(pid)), + TRACE_STRUCT( + TRACE_FIELD(pid_t, pid, pid_nr(pid)) + ), + TPRAWFMT("task %d") + ); -TRACE_FORMAT(sched_process_fork, +TRACE_EVENT_FORMAT(sched_process_fork, TPPROTO(struct task_struct *parent, struct task_struct *child), TPARGS(parent, child), TPFMT("parent %s:%d child %s:%d", - parent->comm, parent->pid, child->comm, child->pid)); + parent->comm, parent->pid, child->comm, child->pid), + TRACE_STRUCT( + TRACE_FIELD(pid_t, parent, parent->pid) + TRACE_FIELD(pid_t, child, child->pid) + ), + TPRAWFMT("parent %d child %d") + ); -TRACE_FORMAT(sched_signal_send, +TRACE_EVENT_FORMAT(sched_signal_send, TPPROTO(int sig, struct task_struct *p), TPARGS(sig, p), - TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid)); + TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid), + TRACE_STRUCT( + TRACE_FIELD(int, sig, sig) + TRACE_FIELD(pid_t, pid, p->pid) + ), + TPRAWFMT("sig: %d task %d") + ); #undef TRACE_SYSTEM -- cgit v1.2.3-59-g8ed1b From f2034f1e1adaac6713a6d48b5a2d4f3aa3e63ccb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 28 Feb 2009 02:54:39 -0500 Subject: tracing: create the C style tracing for the irq subsystem This patch utilizes the TRACE_EVENT_FORMAT macro to enable the C style faster tracing for the irq subsystem trace points. Signed-off-by: Steven Rostedt --- include/trace/irq_event_types.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 47a2be1b2d7d..65850bc5ea06 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -8,15 +8,26 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq -TRACE_FORMAT(irq_handler_entry, +TRACE_EVENT_FORMAT(irq_handler_entry, TPPROTO(int irq, struct irqaction *action), TPARGS(irq, action), - TPFMT("irq=%d handler=%s", irq, action->name)); + TPFMT("irq=%d handler=%s", irq, action->name), + TRACE_STRUCT( + TRACE_FIELD(int, irq, irq) + ), + TPRAWFMT("irq %d") + ); -TRACE_FORMAT(irq_handler_exit, +TRACE_EVENT_FORMAT(irq_handler_exit, TPPROTO(int irq, struct irqaction *action, int ret), TPARGS(irq, action, ret), TPFMT("irq=%d handler=%s return=%s", - irq, action->name, ret ? "handled" : "unhandled")); + irq, action->name, ret ? "handled" : "unhandled"), + TRACE_STRUCT( + TRACE_FIELD(int, irq, irq) + TRACE_FIELD(int, ret, ret) + ), + TPRAWFMT("irq %d ret %d") + ); #undef TRACE_SYSTEM -- cgit v1.2.3-59-g8ed1b From 3e8ebb5c433f016dff5824587436642d87fc2e6c Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Sun, 1 Mar 2009 20:41:41 -0500 Subject: debug_objects: add boot-parameter toggle to turn object debugging off again While trying to debug why my Atom netbook is falling over booting rawhide debug-enabled kernels, I stumbled across the fact that we've been enabling object debugging by default. However, once you default it to on, you've got no way to turn it back off again at runtime. Add a boolean toggle to turn it off. I would just make it an int module_param, however people may already expect the boolean enable behaviour, so just add an analogue for disabling. Signed-off-by: Kyle McMartin Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 3 +++ lib/debugobjects.c | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 54f21a5c262b..1a328f33e1df 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -604,6 +604,9 @@ and is between 256 and 4096 characters. It is defined in the file debug_objects [KNL] Enable object debugging + no_debug_objects + [KNL] Disable object debugging + debugpat [X86] Enable PAT debugging decnet.addr= [HW,NET] diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 5d99be1fd988..90e46fa12721 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -55,7 +55,15 @@ static int __init enable_object_debug(char *str) debug_objects_enabled = 1; return 0; } + +static int __init disable_object_debug(char *str) +{ + debug_objects_enabled = 0; + return 0; +} + early_param("debug_objects", enable_object_debug); +early_param("no_debug_objects", disable_object_debug); static const char *obj_states[ODEBUG_STATE_MAX] = { [ODEBUG_STATE_NONE] = "none", -- cgit v1.2.3-59-g8ed1b From d20e3b03842bfeb9d21817ff19054c277cc3eac0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 10:53:15 -0500 Subject: tracing: add TRACE_FIELD_SPECIAL to record complex entries Tom Zanussi pointed out that the simple TRACE_FIELD was not enough to record trace data that required memcpy. This patch addresses this issue by adding a TRACE_FIELD_SPECIAL. The format is similar to TRACE_FIELD but looks like so: TRACE_FIELD_SPECIAL(type_item, item, cmd) What TRACE_FIELD gave was: TRACE_FIELD(type, item, assign) The TRACE_FIELD would be used in declaring a structure: struct { type item; }; And later assign it via: entry->item = assign; What TRACE_FIELD_SPECIAL gives us is: In the declaration of the structure: struct { type_item; }; And the assignment: cmd; This change log will explain the one example used in the patch: TRACE_EVENT_FORMAT(sched_switch, TPPROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TPARGS(rq, prev, next), TPFMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], next_comm, TPCMD(memcpy(TRACE_ENTRY->next_comm, next->comm, TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), TPRAWFMT("prev %d:%d ==> next %s:%d:%d") ); The struct will be create as: struct { pid_t prev_pid; int prev_prio; char next_comm[TASK_COMM_LEN]; pid_t next_pid; int next_prio; }; Note the TRACE_ENTRY in the cmd part of TRACE_SPECIAL. TRACE_ENTRY will be set by the tracer to point to the structure inside the trace buffer. entry->prev_pid = prev->pid; entry->prev_prio = prev->prio; memcpy(entry->next_comm, next->comm, TASK_COMM_LEN); entry->next_pid = next->pid; entry->next_prio = next->prio Reported-by: Tom Zanussi Signed-off-by: Steven Rostedt --- include/trace/sched_event_types.h | 7 ++++++- kernel/trace/trace_events_stage_1.h | 2 ++ kernel/trace/trace_events_stage_2.h | 4 ++++ kernel/trace/trace_events_stage_3.h | 14 ++++++++++++++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index ba059c10b58a..a6de5c1601a0 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -71,10 +71,15 @@ TRACE_EVENT_FORMAT(sched_switch, TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) + TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], + next_comm, + TPCMD(memcpy(TRACE_ENTRY->next_comm, + next->comm, + TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), - TPRAWFMT("prev %d:%d ==> next %d:%d") + TPRAWFMT("prev %d:%d ==> next %s:%d:%d") ); TRACE_EVENT_FORMAT(sched_migrate_task, diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index fd3bf9382d37..3830a731424c 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -30,5 +30,7 @@ #define TRACE_FIELD(type, item, assign) \ type item; +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + type_item; #include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 3eaaef5f19e1..dc79fe3a2ecb 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -39,6 +39,10 @@ #define TRACE_FIELD(type, item, assign) \ field->item, +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + field->item, + #undef TPRAWFMT #define TPRAWFMT(args...) args diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 7a161c49deb4..2ab65e958223 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -147,6 +147,20 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign)\ + entry->item = assign; + +#undef TPCMD +#define TPCMD(cmd...) cmd + +#undef TRACE_ENTRY +#define TRACE_ENTRY entry + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + cmd; + #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ -- cgit v1.2.3-59-g8ed1b From 11a241a3302277db05561e01477528629d806c4e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 11:49:04 -0500 Subject: tracing: add protection around modify trace event fields The trace event objects are currently not proctected against reentrancy. This patch adds a mutex around the modifications of the trace event fields. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1d07f800a9ce..26069fa6b3b0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -14,6 +14,8 @@ #define TRACE_SYSTEM "TRACE_SYSTEM" +static DEFINE_MUTEX(event_mutex); + #define events_for_each(event) \ for (event = __start_ftrace_events; \ (unsigned long)event < (unsigned long)__stop_ftrace_events; \ @@ -104,6 +106,7 @@ static int ftrace_set_clr_event(char *buf, int set) event = NULL; } + mutex_lock(&event_mutex); events_for_each(call) { if (!call->name) @@ -124,6 +127,8 @@ static int ftrace_set_clr_event(char *buf, int set) ret = 0; } + mutex_unlock(&event_mutex); + return ret; } @@ -324,7 +329,9 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, switch (val) { case 0: case 1: + mutex_lock(&event_mutex); ftrace_event_enable_disable(call, val); + mutex_unlock(&event_mutex); break; default: -- cgit v1.2.3-59-g8ed1b From f9520750c4c9924c14325cd951efae5fae58104c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 14:04:40 -0500 Subject: tracing: make trace_seq_reset global and rename to trace_seq_init Impact: clean up The trace_seq functions may be used separately outside of the ftrace iterator. The trace_seq_reset is needed for these operations. This patch also renames trace_seq_reset to the more appropriate trace_seq_init. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 17 +++++------------ kernel/trace/trace.h | 8 ++++++++ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c5e39cd7310d..ea055aa21cd9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -342,13 +342,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(tsk); } -static void -trace_seq_reset(struct trace_seq *s) -{ - s->len = 0; - s->readpos = 0; -} - ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) { int len; @@ -395,7 +388,7 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s) s->buffer[len] = 0; seq_puts(m, s->buffer); - trace_seq_reset(s); + trace_seq_init(s); } /** @@ -2620,7 +2613,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, if (sret != -EBUSY) return sret; - trace_seq_reset(&iter->seq); + trace_seq_init(&iter->seq); /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); @@ -2682,7 +2675,7 @@ waitagain: /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (iter->seq.readpos >= iter->seq.len) - trace_seq_reset(&iter->seq); + trace_seq_init(&iter->seq); /* * If there was nothing to send to user, inspite of consuming trace @@ -2819,7 +2812,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, partial[i].offset = 0; partial[i].len = iter->seq.len; - trace_seq_reset(&iter->seq); + trace_seq_init(&iter->seq); } mutex_unlock(&iter->mutex); @@ -3631,7 +3624,7 @@ trace_printk_seq(struct trace_seq *s) printk(KERN_TRACE "%s", s->buffer); - trace_seq_reset(s); + trace_seq_init(s); } void ftrace_dump(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f6fa0b9f83a8..cf6ba4181b14 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -395,6 +395,14 @@ struct trace_seq { unsigned int readpos; }; +static inline void +trace_seq_init(struct trace_seq *s) +{ + s->len = 0; + s->readpos = 0; +} + + #define TRACE_PIPE_ALL_CPU -1 /* -- cgit v1.2.3-59-g8ed1b From 981d081ec8b958b7d962ee40d433581a55d40fc5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 13:53:59 -0500 Subject: tracing: add format file to describe event struct fields This patch adds the "format" file to the trace point event directory. This is based off of work by Tom Zanussi, in which a file is exported to be tread from user land such that a user space app may read the binary record stored in the ring buffer. # cat /debug/tracing/events/sched/sched_switch/format field:pid_t prev_pid; offset:12; size:4; field:int prev_prio; offset:16; size:4; field special:char next_comm[TASK_COMM_LEN]; offset:20; size:16; field:pid_t next_pid; offset:36; size:4; field:int next_prio; offset:40; size:4; Idea-from: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 56 ++++++++++++++++++++++++++++++++++++- kernel/trace/trace_events_stage_2.h | 52 ++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_stage_3.h | 2 ++ 4 files changed, 110 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cf6ba4181b14..e606633fb498 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -754,6 +754,7 @@ struct ftrace_event_call { int (*raw_init)(void); int (*raw_reg)(void); void (*raw_unreg)(void); + int (*show_format)(struct trace_seq *s); }; void event_trace_printk(unsigned long ip, const char *fmt, ...); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 26069fa6b3b0..d57a772981c1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3,6 +3,9 @@ * * Copyright (C) 2008 Red Hat Inc, Steven Rostedt * + * - Added format output of fields of the trace point. + * This was based off of work by Tom Zanussi . + * */ #include @@ -444,6 +447,42 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +static ssize_t +event_format_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + char *buf; + int r; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + if (*ppos) + return 0; + + r = call->show_format(s); + if (!r) { + /* + * ug! The format output is bigger than a PAGE!! + */ + buf = "FORMAT TOO BIG\n"; + r = simple_read_from_buffer(ubuf, cnt, ppos, + buf, strlen(buf)); + goto out; + } + + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, s->len); + out: + kfree(s); + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -490,6 +529,11 @@ static const struct file_operations ftrace_available_types_fops = { .read = event_available_types_read, }; +static const struct file_operations ftrace_event_format_fops = { + .open = tracing_open_generic, + .read = event_format_read, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -602,7 +646,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) &ftrace_available_types_fops); if (!entry) pr_warning("Could not create debugfs " - "'%s/type' available_types\n", call->name); + "'%s/available_types' entry\n", call->name); + + /* A trace may not want to export its format */ + if (!call->show_format) + return 0; + + entry = debugfs_create_file("format", 0444, call->dir, call, + &ftrace_event_format_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/format' entry\n", call->name); return 0; } diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index dc79fe3a2ecb..3a80ea4e92cb 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -74,3 +74,55 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ } #include + +/* + * Setup the showing format of trace point. + * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " size:%d; offset:%d;\n", + * sizeof(field.type), + * offsetof(struct ftrace_raw_##call, + * item)); + * + * } + */ + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%lu;\tsize:%lu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; + + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ + "offset:%lu;\tsize:%lu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + return ret; \ +} + +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 2ab65e958223..c62a4d2a5283 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -101,6 +101,7 @@ * .raw_init = ftrace_raw_init_event_, * .raw_reg = ftrace_raw_reg_event_, * .raw_unreg = ftrace_raw_unreg_event_, + * .show_format = ftrace_format_, * } * */ @@ -230,4 +231,5 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .raw_init = ftrace_raw_init_event_##call, \ .raw_reg = ftrace_raw_reg_event_##call, \ .raw_unreg = ftrace_raw_unreg_event_##call, \ + .show_format = ftrace_format_##call, \ } -- cgit v1.2.3-59-g8ed1b From 91729ef96661bfa7dc53923746cd90b62d5495cc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 15:03:01 -0500 Subject: tracing: add ftrace headers to event format files This patch includes the ftrace header to the event formats files: # cat /debug/tracing/events/sched/sched_switch/format field:unsigned char type; offset:0; size:1; field:unsigned char flags; offset:1; size:1; field:unsigned char preempt_count; offset:2; size:1; field:int pid; offset:4; size:4; field:int tgid; offset:8; size:4; field:pid_t prev_pid; offset:12; size:4; field:int prev_prio; offset:16; size:4; field special:char next_comm[TASK_COMM_LEN]; offset:20; size:16; field:pid_t next_pid; offset:36; size:4; field:int next_prio; offset:40; size:4; A blank line is used as a deliminator between the ftrace header and the trace point fields. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d57a772981c1..cdcc3aed76fd 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -13,7 +13,7 @@ #include #include -#include "trace.h" +#include "trace_output.h" #define TRACE_SYSTEM "TRACE_SYSTEM" @@ -447,6 +447,28 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } +#undef FIELD +#define FIELD(type, name) \ + #type, #name, offsetof(typeof(field), name), sizeof(field.name) + +static int trace_write_header(struct trace_seq *s) +{ + struct trace_entry field; + + /* struct trace_entry */ + return trace_seq_printf(s, + "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\n", + FIELD(unsigned char, type), + FIELD(unsigned char, flags), + FIELD(unsigned char, preempt_count), + FIELD(int, pid), + FIELD(int, tgid)); +} static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -465,6 +487,9 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, if (*ppos) return 0; + /* If this fails, so will the show_format. */ + trace_write_header(s); + r = call->show_format(s); if (!r) { /* -- cgit v1.2.3-59-g8ed1b From c5e4e19271edfdf1abd4184933d40d646da6a091 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 15:10:02 -0500 Subject: tracing: add trace name and id to event formats To be able to identify the trace in the binary format output, the id of the trace event (which is dynamically assigned) must also be listed. This patch adds the name of the trace point as well as the id assigned. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index cdcc3aed76fd..210e71ff82db 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -487,7 +487,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, if (*ppos) return 0; - /* If this fails, so will the show_format. */ + /* If any of the first writes fail, so will the show_format. */ + + trace_seq_printf(s, "name: %s\n", call->name); + trace_seq_printf(s, "ID: %d\n", call->id); + trace_seq_printf(s, "format:\n"); trace_write_header(s); r = call->show_format(s); -- cgit v1.2.3-59-g8ed1b From 96ccd21cd13140221bda74a4fc4e53ffeba7c7d4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 15:22:21 -0500 Subject: tracing: add print format to event trace format files This patch adds the internal print format used to print the raw events to the event trace point format file. # cat /debug/tracing/events/sched/sched_switch/format name: sched_switch ID: 29 format: field:unsigned char type; offset:0; size:1; field:unsigned char flags; offset:1; size:1; field:unsigned char preempt_count; offset:2; size:1; field:int pid; offset:4; size:4; field:int tgid; offset:8; size:4; field:pid_t prev_pid; offset:12; size:4; field:int prev_prio; offset:16; size:4; field special:char next_comm[TASK_COMM_LEN]; offset:20; size:16; field:pid_t next_pid; offset:36; size:4; field:int next_prio; offset:40; size:4; print fmt: "prev %d:%d ==> next %s:%d:%d" Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_stage_2.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 3a80ea4e92cb..b1cebba1d9b4 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -122,6 +122,8 @@ ftrace_format_##call(struct trace_seq *s) \ \ tstruct; \ \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ return ret; \ } -- cgit v1.2.3-59-g8ed1b From c79a61f55773d2519fd0525bf58385f7d20752d3 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-Koenig Date: Fri, 27 Feb 2009 21:30:03 +0100 Subject: tracing: make CALLER_ADDRx overwriteable The current definition of CALLER_ADDRx isn't suitable for all platforms. E.g. for ARM __builtin_return_address(N) doesn't work for N > 0 and AFAIK for powerpc there are no frame pointers needed to have a working __builtin_return_address. This patch allows defining the CALLER_ADDRx macros in and let these take precedence. Because now is included unconditionally in all archs that don't already had this include get an empty one for free. Signed-off-by: Uwe Kleine-Koenig Cc: Peter Zijlstra Cc: Ingo Molnar Reviewed-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/ftrace.h | 1 + arch/avr32/include/asm/ftrace.h | 1 + arch/blackfin/include/asm/ftrace.h | 1 + arch/cris/include/asm/ftrace.h | 1 + arch/h8300/include/asm/ftrace.h | 1 + arch/m68k/include/asm/ftrace.h | 1 + arch/mips/include/asm/ftrace.h | 1 + arch/parisc/include/asm/ftrace.h | 1 + arch/um/include/asm/ftrace.h | 1 + arch/xtensa/include/asm/ftrace.h | 1 + include/asm-frv/ftrace.h | 1 + include/asm-m32r/ftrace.h | 1 + include/asm-mn10300/ftrace.h | 1 + include/linux/ftrace.h | 41 +++++++++++++++++++------------------- 14 files changed, 34 insertions(+), 20 deletions(-) create mode 100644 arch/alpha/include/asm/ftrace.h create mode 100644 arch/avr32/include/asm/ftrace.h create mode 100644 arch/blackfin/include/asm/ftrace.h create mode 100644 arch/cris/include/asm/ftrace.h create mode 100644 arch/h8300/include/asm/ftrace.h create mode 100644 arch/m68k/include/asm/ftrace.h create mode 100644 arch/mips/include/asm/ftrace.h create mode 100644 arch/parisc/include/asm/ftrace.h create mode 100644 arch/um/include/asm/ftrace.h create mode 100644 arch/xtensa/include/asm/ftrace.h create mode 100644 include/asm-frv/ftrace.h create mode 100644 include/asm-m32r/ftrace.h create mode 100644 include/asm-mn10300/ftrace.h diff --git a/arch/alpha/include/asm/ftrace.h b/arch/alpha/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/alpha/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/avr32/include/asm/ftrace.h b/arch/avr32/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/avr32/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/blackfin/include/asm/ftrace.h b/arch/blackfin/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/blackfin/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/cris/include/asm/ftrace.h b/arch/cris/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/cris/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/h8300/include/asm/ftrace.h b/arch/h8300/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/h8300/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/m68k/include/asm/ftrace.h b/arch/m68k/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/m68k/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/mips/include/asm/ftrace.h b/arch/mips/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/mips/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/parisc/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/um/include/asm/ftrace.h b/arch/um/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/um/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/arch/xtensa/include/asm/ftrace.h b/arch/xtensa/include/asm/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/arch/xtensa/include/asm/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/asm-frv/ftrace.h b/include/asm-frv/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-frv/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/asm-m32r/ftrace.h b/include/asm-m32r/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-m32r/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/asm-mn10300/ftrace.h b/include/asm-mn10300/ftrace.h new file mode 100644 index 000000000000..40a8c178f10d --- /dev/null +++ b/include/asm-mn10300/ftrace.h @@ -0,0 +1 @@ +/* empty */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 847bb3c48dd0..1f69ac7c1587 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -11,6 +11,8 @@ #include #include +#include + #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -103,8 +105,6 @@ struct ftrace_func_command { }; #ifdef CONFIG_DYNAMIC_FTRACE -/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */ -#include int ftrace_arch_code_modify_prepare(void); int ftrace_arch_code_modify_post_process(void); @@ -282,24 +282,25 @@ static inline void __ftrace_enabled_restore(int enabled) #endif } -#ifdef CONFIG_FRAME_POINTER -/* TODO: need to fix this for ARM */ -# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) -# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) -# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) -# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) -# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) -# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) -#else -# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -# define CALLER_ADDR1 0UL -# define CALLER_ADDR2 0UL -# define CALLER_ADDR3 0UL -# define CALLER_ADDR4 0UL -# define CALLER_ADDR5 0UL -# define CALLER_ADDR6 0UL -#endif +#ifndef HAVE_ARCH_CALLER_ADDR +# ifdef CONFIG_FRAME_POINTER +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +# define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) +# else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +# define CALLER_ADDR4 0UL +# define CALLER_ADDR5 0UL +# define CALLER_ADDR6 0UL +# endif +#endif /* ifndef HAVE_ARCH_CALLER_ADDR */ #ifdef CONFIG_IRQSOFF_TRACER extern void time_hardirqs_on(unsigned long a0, unsigned long a1); -- cgit v1.2.3-59-g8ed1b From 633ddaa7f471e9db181f993c1458d6f4bae321ca Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 09:43:50 -0500 Subject: tracing: fix return value to registering events The registering of events had the return value check backwards. A zero returned is success, the check had it as a failure. This patch also fixes a missing "\n" in the warning that the check failed. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_stage_3.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index c62a4d2a5283..041789ffbac1 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -120,9 +120,9 @@ static int ftrace_reg_event_##call(void) \ int ret; \ \ ret = register_trace_##call(ftrace_event_##call); \ - if (!ret) \ + if (ret) \ pr_info("event trace: Could not activate trace point " \ - "probe to " #call); \ + "probe to " #call "\n"); \ return ret; \ } \ \ @@ -195,9 +195,9 @@ static int ftrace_raw_reg_event_##call(void) \ int ret; \ \ ret = register_trace_##call(ftrace_raw_event_##call); \ - if (!ret) \ + if (ret) \ pr_info("event trace: Could not activate trace point " \ - "probe to " #call); \ + "probe to " #call "\n"); \ return ret; \ } \ \ -- cgit v1.2.3-59-g8ed1b From 41be4da4e85e58520b934040966a6ae919c66c2d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 2 Mar 2009 20:56:48 -0500 Subject: ring-buffer: reset write field for ring_buffer_read_page Impact: fix ring_buffer_read_page After a page is swapped into the ring buffer, the write field must also be reset. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a8c275c01e83..9baad7ee4b36 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2492,6 +2492,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, rb_init_page(bpage); bpage = cpu_buffer->reader_page->page; cpu_buffer->reader_page->page = *data_page; + local_set(&cpu_buffer->reader_page->write, 0); cpu_buffer->reader_page->read = 0; *data_page = bpage; } -- cgit v1.2.3-59-g8ed1b From ef7a4a161472b952941bf78855a9cd95703c024e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 00:27:49 -0500 Subject: ring-buffer: fix ring_buffer_read_page The ring_buffer_read_page was broken if it were to only copy part of the page. This patch fixes that up as well as adds a parameter to allow a length field, in order to only copy part of the buffer page. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 7 +++- kernel/trace/ring_buffer.c | 92 +++++++++++++++++++++++++++++---------------- 2 files changed, 64 insertions(+), 35 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f5e793d69bd3..79fcbc4b09d6 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -121,6 +121,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +size_t ring_buffer_page_len(void *page); + + /* * The below functions are fine to use outside the tracing facility. */ @@ -138,8 +141,8 @@ static inline int tracing_is_on(void) { return 0; } void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); -int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full); +int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, + size_t len, int cpu, int full); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9baad7ee4b36..2ad6bae95a3d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -234,6 +234,11 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +size_t ring_buffer_page_len(void *page) +{ + return local_read(&((struct buffer_data_page *)page)->commit); +} + /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -2378,8 +2383,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) { - unsigned long addr; struct buffer_data_page *bpage; + unsigned long addr; addr = __get_free_page(GFP_KERNEL); if (!addr) @@ -2387,6 +2392,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) bpage = (void *)addr; + rb_init_page(bpage); + return bpage; } @@ -2406,6 +2413,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * ring_buffer_read_page - extract a page from the ring buffer * @buffer: buffer to extract from * @data_page: the page to use allocated from ring_buffer_alloc_read_page + * @len: amount to extract * @cpu: the cpu of the buffer to extract * @full: should the extraction only happen when the page is full. * @@ -2418,7 +2426,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * rpage = ring_buffer_alloc_read_page(buffer); * if (!rpage) * return error; - * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); + * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); * @@ -2435,71 +2443,89 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) * <0 if no data has been transferred. */ int ring_buffer_read_page(struct ring_buffer *buffer, - void **data_page, int cpu, int full) + void **data_page, size_t len, int cpu, int full) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct ring_buffer_event *event; struct buffer_data_page *bpage; + struct buffer_page *reader; unsigned long flags; + unsigned int commit; unsigned int read; int ret = -1; if (!data_page) - return 0; + return -1; bpage = *data_page; if (!bpage) - return 0; + return -1; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - /* - * rb_buffer_peek will get the next ring buffer if - * the current reader page is empty. - */ - event = rb_buffer_peek(buffer, cpu, NULL); - if (!event) + reader = rb_get_reader_page(cpu_buffer); + if (!reader) goto out; - /* check for data */ - if (!local_read(&cpu_buffer->reader_page->page->commit)) - goto out; + event = rb_reader_event(cpu_buffer); + + read = reader->read; + commit = rb_page_commit(reader); - read = cpu_buffer->reader_page->read; /* - * If the writer is already off of the read page, then simply - * switch the read page with the given page. Otherwise - * we need to copy the data from the reader to the writer. + * If len > what's left on the page, and the writer is also off of + * the read page, then simply switch the read page with the given + * page. Otherwise we need to copy the data from the reader to the + * writer. */ - if (cpu_buffer->reader_page == cpu_buffer->commit_page) { - unsigned int commit = rb_page_commit(cpu_buffer->reader_page); + if ((len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; + unsigned int pos = read; + unsigned int size; if (full) goto out; - /* The writer is still on the reader page, we must copy */ - memcpy(bpage->data + read, rpage->data + read, commit - read); - /* consume what was read */ - cpu_buffer->reader_page->read = commit; + if (len > (commit - read)) + len = (commit - read); + + size = rb_event_length(event); + + if (len < size) + goto out; + + /* Need to copy one event at a time */ + do { + memcpy(bpage->data + pos, rpage->data + pos, size); + + len -= size; + + rb_advance_reader(cpu_buffer); + pos = reader->read; + + event = rb_reader_event(cpu_buffer); + size = rb_event_length(event); + } while (len > size); /* update bpage */ - local_set(&bpage->commit, commit); - if (!read) - bpage->time_stamp = rpage->time_stamp; + local_set(&bpage->commit, pos); + bpage->time_stamp = rpage->time_stamp; + } else { /* swap the pages */ rb_init_page(bpage); - bpage = cpu_buffer->reader_page->page; - cpu_buffer->reader_page->page = *data_page; - local_set(&cpu_buffer->reader_page->write, 0); - cpu_buffer->reader_page->read = 0; + bpage = reader->page; + reader->page = *data_page; + local_set(&reader->write, 0); + reader->read = 0; *data_page = bpage; + + /* update the entry counter */ + rb_remove_entries(cpu_buffer, bpage, read); } ret = read; - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); -- cgit v1.2.3-59-g8ed1b From e3d6bf0a0781a269f34250fd41e0d3dbfe540cf1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 13:53:07 -0500 Subject: ring-buffer: replace sizeof of event header with offsetof Impact: fix to possible alignment problems on some archs. Some arch compilers include an NULL char array in the sizeof field. Since the ring_buffer_event type includes one of these, it is better to use the "offsetof" instead, to avoid strange bugs on these archs. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2ad6bae95a3d..27cf834d8b4e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -132,7 +132,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) } EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); -#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA 28 -- cgit v1.2.3-59-g8ed1b From 474d32b68d6d842f3e710e9ae9fe2568c53339f8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 19:51:40 -0500 Subject: ring-buffer: make ring_buffer_read_page read from start on partial page Impact: dont leave holes in read buffer page The ring_buffer_read_page swaps a given page with the reader page of the ring buffer, if certain conditions are set: 1) requested length is big enough to hold entire page data 2) a writer is not currently on the page 3) the page is not partially consumed. Instead of swapping with the supplied page. It copies the data to the supplied page instead. But currently the data is copied in the same offset as the source page. This causes a hole at the start of the reader page. This complicates the use of this function. Instead, it should copy the data at the beginning of the function and update the index fields accordingly. Other small clean ups are also done in this patch. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 27cf834d8b4e..f2a163db52f9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -61,6 +61,8 @@ enum { static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + /** * tracing_on - enable all tracing buffers * @@ -234,9 +236,16 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +/** + * ring_buffer_page_len - the size of data on the page. + * @page: The page to read + * + * Returns the amount of data on the page, including buffer page header. + */ size_t ring_buffer_page_len(void *page) { - return local_read(&((struct buffer_data_page *)page)->commit); + return local_read(&((struct buffer_data_page *)page)->commit) + + BUF_PAGE_HDR_SIZE; } /* @@ -259,7 +268,7 @@ static inline int test_time_stamp(u64 delta) return 0; } -#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data)) +#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) /* * head_page == tail_page && head == tail then buffer is empty. @@ -2454,6 +2463,15 @@ int ring_buffer_read_page(struct ring_buffer *buffer, unsigned int read; int ret = -1; + /* + * If len is not big enough to hold the page header, then + * we can not copy anything. + */ + if (len <= BUF_PAGE_HDR_SIZE) + return -1; + + len -= BUF_PAGE_HDR_SIZE; + if (!data_page) return -1; @@ -2473,15 +2491,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer, commit = rb_page_commit(reader); /* - * If len > what's left on the page, and the writer is also off of - * the read page, then simply switch the read page with the given - * page. Otherwise we need to copy the data from the reader to the - * writer. + * If this page has been partially read or + * if len is not big enough to read the rest of the page or + * a writer is still on the page, then + * we must copy the data from the page to the buffer. + * Otherwise, we can simply swap the page with the one passed in. */ - if ((len < (commit - read)) || + if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; - unsigned int pos = read; + unsigned int rpos = read; + unsigned int pos = 0; unsigned int size; if (full) @@ -2497,12 +2517,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* Need to copy one event at a time */ do { - memcpy(bpage->data + pos, rpage->data + pos, size); + memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; rb_advance_reader(cpu_buffer); - pos = reader->read; + rpos = reader->read; + pos += size; event = rb_reader_event(cpu_buffer); size = rb_event_length(event); @@ -2512,6 +2533,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&bpage->commit, pos); bpage->time_stamp = rpage->time_stamp; + /* we copied everything to the beginning */ + read = 0; } else { /* swap the pages */ rb_init_page(bpage); -- cgit v1.2.3-59-g8ed1b From 2cadf9135eb3b6d84b6427314be827ddd443c308 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Dec 2008 22:20:19 -0500 Subject: tracing: add binary buffer files for use with splice Impact: new feature This patch creates a directory of files that correspond to the per CPU ring buffers. These are binary files and are made to be used with splice. This is the fastest way to extract data from the ftrace ring buffers. Thanks to Jiaying Zhang for pushing me to get this code fixed, and to Eduard - Gabriel Munteanu for his splice code that helped me debug my code. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 274 +++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 1 + 2 files changed, 268 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ea055aa21cd9..12539f72f4a5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -11,31 +11,30 @@ * Copyright (C) 2004-2006 Ingo Molnar * Copyright (C) 2004 William Lee Irwin III */ +#include #include +#include +#include #include #include #include +#include #include #include #include #include #include +#include #include #include #include +#include #include #include #include #include #include #include -#include -#include -#include - -#include -#include -#include #include "trace.h" #include "trace_output.h" @@ -3005,6 +3004,246 @@ static struct file_operations tracing_mark_fops = { .write = tracing_mark_write, }; +struct ftrace_buffer_info { + struct trace_array *tr; + void *spare; + int cpu; + unsigned int read; +}; + +static int tracing_buffers_open(struct inode *inode, struct file *filp) +{ + int cpu = (int)(long)inode->i_private; + struct ftrace_buffer_info *info; + + if (tracing_disabled) + return -ENODEV; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->tr = &global_trace; + info->cpu = cpu; + info->spare = ring_buffer_alloc_read_page(info->tr->buffer); + /* Force reading ring buffer for first read */ + info->read = (unsigned int)-1; + if (!info->spare) + goto out; + + filp->private_data = info; + + return 0; + + out: + kfree(info); + return -ENOMEM; +} + +static ssize_t +tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct ftrace_buffer_info *info = filp->private_data; + unsigned int pos; + ssize_t ret; + size_t size; + + /* Do we have previous read data to read? */ + if (info->read < PAGE_SIZE) + goto read; + + info->read = 0; + + ret = ring_buffer_read_page(info->tr->buffer, + &info->spare, + count, + info->cpu, 0); + if (ret < 0) + return 0; + + pos = ring_buffer_page_len(info->spare); + + if (pos < PAGE_SIZE) + memset(info->spare + pos, 0, PAGE_SIZE - pos); + +read: + size = PAGE_SIZE - info->read; + if (size > count) + size = count; + + ret = copy_to_user(ubuf, info->spare + info->read, size); + if (ret) + return -EFAULT; + *ppos += size; + info->read += size; + + return size; +} + +static int tracing_buffers_release(struct inode *inode, struct file *file) +{ + struct ftrace_buffer_info *info = file->private_data; + + ring_buffer_free_read_page(info->tr->buffer, info->spare); + kfree(info); + + return 0; +} + +struct buffer_ref { + struct ring_buffer *buffer; + void *page; + int ref; +}; + +static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + if (--ref->ref) + return; + + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + buf->private = 0; +} + +static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + ref->ref++; +} + +/* Pipe buffer operations for a buffer. */ +static struct pipe_buf_operations buffer_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = buffer_pipe_buf_release, + .steal = buffer_pipe_buf_steal, + .get = buffer_pipe_buf_get, +}; + +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. + */ +static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + struct buffer_ref *ref = + (struct buffer_ref *)spd->partial[i].private; + + if (--ref->ref) + return; + + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + spd->partial[i].private = 0; +} + +static ssize_t +tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct ftrace_buffer_info *info = file->private_data; + struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &buffer_pipe_buf_ops, + .spd_release = buffer_spd_release, + }; + struct buffer_ref *ref; + int size, i; + size_t ret; + + /* + * We can't seek on a buffer input + */ + if (unlikely(*ppos)) + return -ESPIPE; + + + for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) { + struct page *page; + int r; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) + break; + + ref->buffer = info->tr->buffer; + ref->page = ring_buffer_alloc_read_page(ref->buffer); + if (!ref->page) { + kfree(ref); + break; + } + + r = ring_buffer_read_page(ref->buffer, &ref->page, + len, info->cpu, 0); + if (r < 0) { + ring_buffer_free_read_page(ref->buffer, + ref->page); + kfree(ref); + break; + } + + /* + * zero out any left over data, this is going to + * user land. + */ + size = ring_buffer_page_len(ref->page); + if (size < PAGE_SIZE) + memset(ref->page + size, 0, PAGE_SIZE - size); + + page = virt_to_page(ref->page); + + spd.pages[i] = page; + spd.partial[i].len = PAGE_SIZE; + spd.partial[i].offset = 0; + spd.partial[i].private = (unsigned long)ref; + spd.nr_pages++; + } + + spd.nr_pages = i; + + /* did we read anything? */ + if (!spd.nr_pages) { + if (flags & SPLICE_F_NONBLOCK) + ret = -EAGAIN; + else + ret = 0; + /* TODO: block */ + return ret; + } + + ret = splice_to_pipe(pipe, &spd); + + return ret; +} + +static const struct file_operations tracing_buffers_fops = { + .open = tracing_buffers_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, +}; + #ifdef CONFIG_DYNAMIC_FTRACE int __weak ftrace_arch_read_dyn_info(char *buf, int size) @@ -3399,6 +3638,7 @@ static __init void create_trace_options_dir(void) static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; + struct dentry *buffers; struct dentry *entry; int cpu; @@ -3471,6 +3711,26 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'trace_marker' entry\n"); + buffers = debugfs_create_dir("binary_buffers", d_tracer); + + if (!buffers) + pr_warning("Could not create buffers directory\n"); + else { + int cpu; + char buf[64]; + + for_each_tracing_cpu(cpu) { + sprintf(buf, "%d", cpu); + + entry = debugfs_create_file(buf, 0444, buffers, + (void *)(long)cpu, + &tracing_buffers_fops); + if (!entry) + pr_warning("Could not create debugfs buffers " + "'%s' entry\n", buf); + } + } + #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e606633fb498..561bb5c5d988 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -217,6 +217,7 @@ enum trace_flag_type { */ struct trace_array_cpu { atomic_t disabled; + void *buffer_page; /* ring buffer spare */ /* these fields get copied into max-trace: */ unsigned long trace_idx; -- cgit v1.2.3-59-g8ed1b From e8fa6b483feebd23ded5eb01afd7a6e82b6078c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:36 -0500 Subject: xfs: prevent kernel crash due to corrupted inode log format Andras Korn reported an oops on log replay causes by a corrupted xfs_inode_log_format_t passing a 0 size to kmem_zalloc. This patch handles to small or too large numbers of log regions gracefully by rejecting the log replay with a useful error message. Signed-off-by: Christoph Hellwig Reported-by: Andras Korn Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/xfs_log_recover.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 504d540e0e2c..ceeba45e0224 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1455,10 +1455,19 @@ xlog_recover_add_to_trans( item = item->ri_prev; if (item->ri_total == 0) { /* first region to be added */ - item->ri_total = in_f->ilf_size; - ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); - item->ri_buf = kmem_zalloc((item->ri_total * - sizeof(xfs_log_iovec_t)), KM_SLEEP); + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xlog_warn( + "XFS: bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + return XFS_ERROR(EIO); + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); } ASSERT(item->ri_total > item->ri_cnt); /* Description region is ri_buf[0] */ -- cgit v1.2.3-59-g8ed1b From ed93ec3907f063268ced18728d0653f6199d100c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:35 -0500 Subject: xfs: prevent lockdep false positive in xfs_iget_cache_miss The inode can't be locked by anyone else as we just created it a few lines above and it's not been added to any lookup data structure yet. So use a trylock that must succeed to get around the lockdep warnings. Signed-off-by: Christoph Hellwig Reported-by: Alexander Beregalov Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_iget.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index e2fb6210d4c5..478e587087fe 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -246,9 +246,6 @@ xfs_iget_cache_miss( goto out_destroy; } - if (lock_flags) - xfs_ilock(ip, lock_flags); - /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload @@ -256,7 +253,16 @@ xfs_iget_cache_miss( */ if (radix_tree_preload(GFP_KERNEL)) { error = EAGAIN; - goto out_unlock; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); } mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); @@ -284,7 +290,6 @@ xfs_iget_cache_miss( out_preload_end: write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); -out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); out_destroy: -- cgit v1.2.3-59-g8ed1b From b79631330a653f568a2ac4eb4a32474c80e3fe77 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:37 -0500 Subject: xfs: only issues a cache flush on unmount if barriers are enabled Currently we unconditionally issue a flush from xfs_free_buftarg, but since 2.6.29-rc1 this gives a warning in the style of end_request: I/O error, dev vdb, sector 0 Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 12 ++++++++++-- fs/xfs/linux-2.6/xfs_buf.h | 2 +- fs/xfs/linux-2.6/xfs_super.c | 10 +++++----- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index cb329edc925b..aa1016bb9134 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -34,6 +34,12 @@ #include #include +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_ag.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" + static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); @@ -1435,10 +1441,12 @@ xfs_unregister_buftarg( void xfs_free_buftarg( - xfs_buftarg_t *btp) + struct xfs_mount *mp, + struct xfs_buftarg *btp) { xfs_flush_buftarg(btp, 1); - xfs_blkdev_issue_flush(btp); + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_blkdev_issue_flush(btp); xfs_free_bufhash(btp); iput(btp->bt_mapping->host); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 288ae7c4c800..9b4d666ad31f 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp) * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); -extern void xfs_free_buftarg(xfs_buftarg_t *); +extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index bc1e64708e2b..8483b35821e0 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -733,15 +733,15 @@ xfs_close_devices( { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - xfs_free_buftarg(mp->m_logdev_targp); + xfs_free_buftarg(mp, mp->m_logdev_targp); xfs_blkdev_put(logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - xfs_free_buftarg(mp->m_rtdev_targp); + xfs_free_buftarg(mp, mp->m_rtdev_targp); xfs_blkdev_put(rtdev); } - xfs_free_buftarg(mp->m_ddev_targp); + xfs_free_buftarg(mp, mp->m_ddev_targp); } /* @@ -810,9 +810,9 @@ xfs_open_devices( out_free_rtdev_targ: if (mp->m_rtdev_targp) - xfs_free_buftarg(mp->m_rtdev_targp); + xfs_free_buftarg(mp, mp->m_rtdev_targp); out_free_ddev_targ: - xfs_free_buftarg(mp->m_ddev_targp); + xfs_free_buftarg(mp, mp->m_ddev_targp); out_close_rtdev: if (rtdev) xfs_blkdev_put(rtdev); -- cgit v1.2.3-59-g8ed1b From 1c21f14ec48a2256fb03682b24dddd23eacdc96f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2009 13:51:13 +0100 Subject: lockdep: fix incorrect state name In the recent mark_lock_irq() rework a bug snuck in that would report the state of write locks causing irq inversion under a read lock as a read lock. Fix this by masking the read bit of the state when validating write dependencies. Reported-by: Andrew Morton Signed-off-by: Peter Zijlstra LKML-Reference: <1236172646.5330.7450.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 022d2ed7fd8b..ef6584fd9fe5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2015,7 +2015,8 @@ typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, enum lock_usage_bit bit, const char *name); static int -mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit) +mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { int excl_bit = exclusive_bit(new_bit); int read = new_bit & 1; @@ -2043,7 +2044,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit) * states. */ if ((!read || !dir || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit))) + !usage(curr, this, excl_bit, state_name(new_bit & ~1))) return 0; /* -- cgit v1.2.3-59-g8ed1b From 26575e28df5eb2050c02369843faba38cecb4d8c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2009 14:53:24 +0100 Subject: lockdep: remove extra "irq" string Impact: clarify lockdep printk text print_irq_inversion_bug() gets handed state strings of the form "HARDIRQ", "SOFTIRQ", "RECLAIM_FS" and appends "-irq-{un,}safe" to them, which is either redudant for *IRQ or confusing in the RECLAIM_FS case. Signed-off-by: Peter Zijlstra LKML-Reference: <1236175192.5330.7585.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ef6584fd9fe5..02014f7ccc86 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1900,9 +1900,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) - printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); + printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else - printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); + printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); print_lock_name(other); printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); -- cgit v1.2.3-59-g8ed1b From 1075414b06109a99b0e87601e84c74a95bd45681 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2009 12:27:27 +0100 Subject: lockdep: require framepointers for x86 Require framepointers for x86, because otherwise we'll be having empty stack traces, which is useless. Signed-off-by: Peter Zijlstra LKML-Reference: <1236167295.5330.7240.camel@laptop> Signed-off-by: Ingo Molnar --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 29044f500269..9b4efb14a497 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -402,7 +402,7 @@ config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select STACKTRACE - select FRAME_POINTER if !X86 && !MIPS && !PPC + select FRAME_POINTER if !MIPS && !PPC select KALLSYMS select KALLSYMS_ALL -- cgit v1.2.3-59-g8ed1b From efed792d6738964f399a508ef9e831cd60fa4657 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2009 12:32:55 +0100 Subject: tracing: add lockdep tracepoints for lock acquire/release Augment the traces with lock names when lockdep is available: 1) | down_read_trylock() { 1) | _spin_lock_irqsave() { 1) | /* lock_acquire: &sem->wait_lock */ 1) 4.201 us | } 1) | _spin_unlock_irqrestore() { 1) | /* lock_release: &sem->wait_lock */ 1) 3.523 us | } 1) | /* lock_acquire: try read &mm->mmap_sem */ 1) + 13.386 us | } 1) 1.635 us | find_vma(); 1) | handle_mm_fault() { 1) | __do_fault() { 1) | filemap_fault() { 1) | find_lock_page() { 1) | find_get_page() { 1) | /* lock_acquire: read rcu_read_lock */ 1) | /* lock_release: rcu_read_lock */ 1) 5.697 us | } 1) 8.158 us | } 1) + 11.079 us | } 1) | _spin_lock() { 1) | /* lock_acquire: __pte_lockptr(page) */ 1) 3.949 us | } 1) 1.460 us | page_add_file_rmap(); 1) | _spin_unlock() { 1) | /* lock_release: __pte_lockptr(page) */ 1) 3.115 us | } 1) | unlock_page() { 1) 1.421 us | page_waitqueue(); 1) 1.220 us | __wake_up_bit(); 1) 6.519 us | } 1) + 34.328 us | } 1) + 37.452 us | } 1) | up_read() { 1) | /* lock_release: &mm->mmap_sem */ 1) | _spin_lock_irqsave() { 1) | /* lock_acquire: &sem->wait_lock */ 1) 3.865 us | } 1) | _spin_unlock_irqrestore() { 1) | /* lock_release: &sem->wait_lock */ 1) 8.562 us | } 1) + 17.370 us | } Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?T=F6r=F6k?= Edwin Cc: Jason Baron Cc: Frederic Weisbecker LKML-Reference: <1236166375.5330.7209.camel@laptop> Signed-off-by: Ingo Molnar --- include/trace/lockdep.h | 9 ++++++++ include/trace/lockdep_event_types.h | 44 +++++++++++++++++++++++++++++++++++++ include/trace/trace_event_types.h | 1 + include/trace/trace_events.h | 1 + kernel/lockdep.c | 17 ++++++++++++++ kernel/trace/trace.c | 14 +++++++----- kernel/trace/trace_events_stage_3.h | 4 ++-- 7 files changed, 82 insertions(+), 8 deletions(-) create mode 100644 include/trace/lockdep.h create mode 100644 include/trace/lockdep_event_types.h diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h new file mode 100644 index 000000000000..5ca67df87f2a --- /dev/null +++ b/include/trace/lockdep.h @@ -0,0 +1,9 @@ +#ifndef _TRACE_LOCKDEP_H +#define _TRACE_LOCKDEP_H + +#include +#include + +#include + +#endif diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h new file mode 100644 index 000000000000..f713d74a82b4 --- /dev/null +++ b/include/trace/lockdep_event_types.h @@ -0,0 +1,44 @@ + +#ifndef TRACE_EVENT_FORMAT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lock + +#ifdef CONFIG_LOCKDEP + +TRACE_FORMAT(lock_acquire, + TPPROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + TPARGS(lock, subclass, trylock, read, check, next_lock, ip), + TPFMT("%s%s%s", trylock ? "try " : "", + read ? "read " : "", lock->name) + ); + +TRACE_FORMAT(lock_release, + TPPROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TPARGS(lock, nested, ip), + TPFMT("%s", lock->name) + ); + +#ifdef CONFIG_LOCK_STAT + +TRACE_FORMAT(lock_contended, + TPPROTO(struct lockdep_map *lock, unsigned long ip), + TPARGS(lock, ip), + TPFMT("%s", lock->name) + ); + +TRACE_FORMAT(lock_acquired, + TPPROTO(struct lockdep_map *lock, unsigned long ip), + TPARGS(lock, ip), + TPFMT("%s", lock->name) + ); + +#endif +#endif + +#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h index 33c8ed5ccb6c..df56f5694be6 100644 --- a/include/trace/trace_event_types.h +++ b/include/trace/trace_event_types.h @@ -2,3 +2,4 @@ #include #include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index ea2ef2051762..fd13750ca4ba 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -2,3 +2,4 @@ #include #include +#include diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 02014f7ccc86..cb70c1db85d0 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,6 +42,7 @@ #include #include #include +#include #include @@ -2913,6 +2914,8 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); +DEFINE_TRACE(lock_acquire); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2923,6 +2926,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + if (unlikely(current->lockdep_recursion)) return; @@ -2937,11 +2942,15 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); +DEFINE_TRACE(lock_release); + void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; + trace_lock_release(lock, nested, ip); + if (unlikely(current->lockdep_recursion)) return; @@ -3090,10 +3099,14 @@ found_it: lock->ip = ip; } +DEFINE_TRACE(lock_contended); + void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_contended(lock, ip); + if (unlikely(!lock_stat)) return; @@ -3109,10 +3122,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); +DEFINE_TRACE(lock_acquired); + void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; + trace_lock_acquired(lock, ip); + if (unlikely(!lock_stat)) return; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 12539f72f4a5..c8abbb0c8397 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -623,7 +623,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; static int cmdline_idx; -static DEFINE_SPINLOCK(trace_cmdline_lock); +static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; /* temporary disable recording */ static atomic_t trace_record_cmdline_disabled __read_mostly; @@ -735,7 +735,7 @@ static void trace_save_cmdline(struct task_struct *tsk) * nor do we want to disable interrupts, * so if we miss here, then better luck next time. */ - if (!spin_trylock(&trace_cmdline_lock)) + if (!__raw_spin_trylock(&trace_cmdline_lock)) return; idx = map_pid_to_cmdline[tsk->pid]; @@ -753,7 +753,7 @@ static void trace_save_cmdline(struct task_struct *tsk) memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); - spin_unlock(&trace_cmdline_lock); + __raw_spin_unlock(&trace_cmdline_lock); } char *trace_find_cmdline(int pid) @@ -3751,7 +3751,7 @@ static __init int tracer_init_debugfs(void) int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { - static DEFINE_SPINLOCK(trace_buf_lock); + static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; struct ring_buffer_event *event; @@ -3773,7 +3773,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out; pause_graph_tracing(); - spin_lock_irqsave(&trace_buf_lock, irq_flags); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&trace_buf_lock); len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); len = min(len, TRACE_BUF_SIZE-1); @@ -3792,7 +3793,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) ring_buffer_unlock_commit(tr->buffer, event); out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, irq_flags); + __raw_spin_unlock(&trace_buf_lock); + raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: preempt_enable_notrace(); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 041789ffbac1..2c8d76c7dbed 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -5,7 +5,7 @@ * * static void ftrace_event_(proto) * { - * event_trace_printk(_RET_IP_, "() " ); + * event_trace_printk(_RET_IP_, ": " ); * } * * static int ftrace_reg_event_(void) @@ -112,7 +112,7 @@ #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ { \ - event_trace_printk(_RET_IP_, "(" #call ") " fmt); \ + event_trace_printk(_RET_IP_, #call ": " fmt); \ } \ \ static int ftrace_reg_event_##call(void) \ -- cgit v1.2.3-59-g8ed1b From e543ad76914abec1acf6631604a4154cd7a2ca6b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 18:20:36 -0500 Subject: tracing: add cpu_file intialization for ftrace_dump Impact: fix to ftrace_dump output corruption The commit: b04cc6b1f6398b0e0b60d37e27ce51b4899672ec tracing/core: introduce per cpu tracing files added a new field to the iterator called cpu_file. This was a handle to differentiate between the per cpu trace output files and the all cpu "trace" file. The all cpu "trace" file required setting this to TRACE_PIPE_ALL_CPU. The problem is that the ftrace_dump sets up its own iterator but was not updated to handle this change. The result was only CPU 0 printing out on crash and a lot of "<0>"'s also being printed. Reported-by: Thomas Gleixner Tested-by: Darren Hart Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c8abbb0c8397..ab5cbcae43a1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3918,8 +3918,10 @@ void ftrace_dump(void) printk(KERN_TRACE "Dumping ftrace buffer:\n"); + /* Simulate the iterator */ iter.tr = &global_trace; iter.trace = current_trace; + iter.cpu_file = TRACE_PIPE_ALL_CPU; /* * We need to stop all tracing on all CPUS to read the -- cgit v1.2.3-59-g8ed1b From 4f3640f8a358f2183a8c966f299eeb55ca523e06 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 3 Mar 2009 23:52:42 -0500 Subject: ring-buffer: fix timestamp in partial ring_buffer_page_read If a partial ring_buffer_page_read happens, then some of the incremental timestamps may be lost. This patch writes the recent timestamp into the page that is passed back to the caller. A partial ring_buffer_page_read is where the full page would not be written back to the user, and instead, just part of the page is copied to the user. A full page would be a page swap with the ring buffer and the timestamps would be correct. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f2a163db52f9..f7473645b9c6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2461,6 +2461,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, unsigned long flags; unsigned int commit; unsigned int read; + u64 save_timestamp; int ret = -1; /* @@ -2515,6 +2516,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (len < size) goto out; + /* save the current timestamp, since the user will need it */ + save_timestamp = cpu_buffer->read_stamp; + /* Need to copy one event at a time */ do { memcpy(bpage->data + pos, rpage->data + rpos, size); @@ -2531,7 +2535,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* update bpage */ local_set(&bpage->commit, pos); - bpage->time_stamp = rpage->time_stamp; + bpage->time_stamp = save_timestamp; /* we copied everything to the beginning */ read = 0; -- cgit v1.2.3-59-g8ed1b From 2dc5d12b1f43134e9bc5037f69f4739cfdfab93e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 19:10:05 -0500 Subject: tracing: do not return EFAULT if read copied anything Impact: fix trace read to conform to standards Andrew Morton, Theodore Tso and H. Peter Anvin brought to my attention that a userspace read should not return -EFAULT if it succeeded in copying anything. It should only return -EFAULT if it failed to copy at all. This patch modifies the check of copy_from_user and updates the return code appropriately. I also used H. Peter Anvin's short cut rule to just test ret == count. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ab5cbcae43a1..57155dc53530 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -346,6 +346,9 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) int len; int ret; + if (!cnt) + return 0; + if (s->len <= s->readpos) return -EBUSY; @@ -353,9 +356,11 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) if (cnt > len) cnt = len; ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); - if (ret) + if (ret == cnt) return -EFAULT; + cnt -= ret; + s->readpos += len; return cnt; } @@ -3049,6 +3054,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, ssize_t ret; size_t size; + if (!count) + return 0; + /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) goto read; @@ -3073,8 +3081,10 @@ read: size = count; ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret) + if (ret == size) return -EFAULT; + size -= ret; + *ppos += size; info->read += size; -- cgit v1.2.3-59-g8ed1b From e74da5235cec6cb71eb338c987f876ecc793138b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 20:31:11 -0500 Subject: tracing: fix seq read from trace files The buffer used by trace_seq was updated incorrectly. Instead of consuming what was actually read, it consumed the rest of the buffer on reads. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 57155dc53530..2e53e6f09440 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -361,7 +361,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) cnt -= ret; - s->readpos += len; + s->readpos += cnt; return cnt; } @@ -380,7 +380,7 @@ ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) if (!ret) return -EFAULT; - s->readpos += len; + s->readpos += cnt; return cnt; } -- cgit v1.2.3-59-g8ed1b From c032ef64d680717e4e8ce3da65da6419a35f8a2c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 20:34:24 -0500 Subject: tracing: add latency output format option With the removal of the latency_trace file, we lost the ability to see some of the finer details in a trace. Like the state of interrupts enabled, the preempt count, need resched, and if we are in an interrupt handler, softirq handler or not. This patch simply creates an option to bring back the old format. This also removes the warning about an unused variable that held the latency_trace file operations. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 24 ++---------------------- kernel/trace/trace.h | 3 ++- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e53e6f09440..55fcbb567950 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -299,6 +299,7 @@ static const char *trace_options[] = { "sym-userobj", "printk-msg-only", "context-info", + "latency-format", NULL }; @@ -1829,26 +1830,12 @@ static int tracing_open(struct inode *inode, struct file *file) iter = __tracing_open(inode, file); if (IS_ERR(iter)) ret = PTR_ERR(iter); - - return ret; -} - -static int tracing_lt_open(struct inode *inode, struct file *file) -{ - struct trace_iterator *iter; - int ret = 0; - - iter = __tracing_open(inode, file); - - if (IS_ERR(iter)) - ret = PTR_ERR(iter); - else + else if (trace_flags & TRACE_ITER_LATENCY_FMT) iter->iter_flags |= TRACE_FILE_LAT_FMT; return ret; } - static void * t_next(struct seq_file *m, void *v, loff_t *pos) { @@ -1927,13 +1914,6 @@ static struct file_operations tracing_fops = { .release = tracing_release, }; -static struct file_operations tracing_lt_fops = { - .open = tracing_lt_open, - .read = seq_read, - .llseek = seq_lseek, - .release = tracing_release, -}; - static struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 561bb5c5d988..12cd119cca32 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -651,7 +651,8 @@ enum trace_iterator_flags { TRACE_ITER_USERSTACKTRACE = 0x4000, TRACE_ITER_SYM_USEROBJ = 0x8000, TRACE_ITER_PRINTK_MSGONLY = 0x10000, - TRACE_ITER_CONTEXT_INFO = 0x20000 /* Print pid/cpu/time */ + TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ + TRACE_ITER_LATENCY_FMT = 0x40000, }; /* -- cgit v1.2.3-59-g8ed1b From 5fd73f862468280d4cbb5ba4321502f911f9f89a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 21:42:04 -0500 Subject: tracing: remove extra latency_trace method from trace structure Impact: clean up The trace and latency_trace function pointers are identical for every tracer but the function tracer. The differences in the function tracer are trivial (latency output puts paranthesis around parent). This patch removes the latency_trace pointer and all prints will now just use the trace output function pointer. Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 1 - kernel/trace/trace.c | 2 +- kernel/trace/trace_branch.c | 1 - kernel/trace/trace_output.c | 32 -------------------------------- kernel/trace/trace_output.h | 1 - 5 files changed, 1 insertion(+), 36 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e82cb9e930cc..e39679a72a3b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1231,7 +1231,6 @@ static struct tracer blk_tracer __read_mostly = { static struct trace_event trace_blk_event = { .type = TRACE_BLK, .trace = blk_trace_event_print, - .latency_trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 55fcbb567950..21b89ecb8480 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1485,7 +1485,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter) } if (event) - return event->latency_trace(iter, sym_flags); + return event->trace(iter, sym_flags); if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) goto partial; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index c2e68d440c4d..aaa0755268b9 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -159,7 +159,6 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, .trace = trace_branch_print, - .latency_trace = trace_branch_print, }; static struct tracer branch_trace __read_mostly = diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 9fc815031b09..306fef84c503 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -437,8 +437,6 @@ int register_ftrace_event(struct trace_event *event) if (event->trace == NULL) event->trace = trace_nop_print; - if (event->latency_trace == NULL) - event->latency_trace = trace_nop_print; if (event->raw == NULL) event->raw = trace_nop_print; if (event->hex == NULL) @@ -480,29 +478,6 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) } /* TRACE_FN */ -static enum print_line_t trace_fn_latency(struct trace_iterator *iter, - int flags) -{ - struct ftrace_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - if (!trace_seq_puts(s, " (")) - goto partial; - if (!seq_print_ip_sym(s, field->parent_ip, flags)) - goto partial; - if (!trace_seq_puts(s, ")\n")) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) { struct ftrace_entry *field; @@ -573,7 +548,6 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) static struct trace_event trace_fn_event = { .type = TRACE_FN, .trace = trace_fn_trace, - .latency_trace = trace_fn_latency, .raw = trace_fn_raw, .hex = trace_fn_hex, .binary = trace_fn_bin, @@ -705,7 +679,6 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, static struct trace_event trace_ctx_event = { .type = TRACE_CTX, .trace = trace_ctx_print, - .latency_trace = trace_ctx_print, .raw = trace_ctx_raw, .hex = trace_ctx_hex, .binary = trace_ctxwake_bin, @@ -714,7 +687,6 @@ static struct trace_event trace_ctx_event = { static struct trace_event trace_wake_event = { .type = TRACE_WAKE, .trace = trace_wake_print, - .latency_trace = trace_wake_print, .raw = trace_wake_raw, .hex = trace_wake_hex, .binary = trace_ctxwake_bin, @@ -770,7 +742,6 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter, static struct trace_event trace_special_event = { .type = TRACE_SPECIAL, .trace = trace_special_print, - .latency_trace = trace_special_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, @@ -808,7 +779,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, static struct trace_event trace_stack_event = { .type = TRACE_STACK, .trace = trace_stack_print, - .latency_trace = trace_stack_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, @@ -838,7 +808,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, static struct trace_event trace_user_stack_event = { .type = TRACE_USER_STACK, .trace = trace_user_stack_print, - .latency_trace = trace_user_stack_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, @@ -883,7 +852,6 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) static struct trace_event trace_print_event = { .type = TRACE_PRINT, .trace = trace_print_print, - .latency_trace = trace_print_print, .raw = trace_print_raw, }; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 551a25a72217..8a34d688ed63 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -10,7 +10,6 @@ struct trace_event { struct hlist_node node; int type; trace_print_func trace; - trace_print_func latency_trace; trace_print_func raw; trace_print_func hex; trace_print_func binary; -- cgit v1.2.3-59-g8ed1b From 27d48be84477d2f0a2e2ac3738a3971dece631d5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 21:57:29 -0500 Subject: tracing: consolidate print_lat_fmt and print_trace_fmt Impact: clean up Both print_lat_fmt and print_trace_fmt do pretty much the same thing except for one different function call. This patch consolidates the two functions and adds an if statement to perform the difference. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 39 +++++++-------------------------------- 1 file changed, 7 insertions(+), 32 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 21b89ecb8480..d1ef43999d9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1468,33 +1468,6 @@ static void test_cpu_buff_start(struct trace_iterator *iter) trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); } -static enum print_line_t print_lat_fmt(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_event *event; - struct trace_entry *entry = iter->ent; - - test_cpu_buff_start(iter); - - event = ftrace_find_event(entry->type); - - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (!trace_print_lat_context(iter)) - goto partial; - } - - if (event) - return event->trace(iter, sym_flags); - - if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) - goto partial; - - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; -} - static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1509,8 +1482,13 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (!trace_print_context(iter)) - goto partial; + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + if (!trace_print_lat_context(iter)) + goto partial; + } else { + if (!trace_print_context(iter)) + goto partial; + } } if (event) @@ -1652,9 +1630,6 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) if (trace_flags & TRACE_ITER_RAW) return print_raw_fmt(iter); - if (iter->iter_flags & TRACE_FILE_LAT_FMT) - return print_lat_fmt(iter); - return print_trace_fmt(iter); } -- cgit v1.2.3-59-g8ed1b From e9d25fe6eaa2c720bb3ea661b660e58d54fa38bf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Mar 2009 22:15:30 -0500 Subject: tracing: have latency tracers set the latency format The latency tracers (irqsoff, preemptoff, preemptirqsoff, and wakeup) are pretty useless with the default output format. This patch makes them automatically enable the latency format when they are selected. They also record the state of the latency option, and if it was not enabled when selected, they disable it on reset. Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 8 ++++++++ kernel/trace/trace_sched_wakeup.c | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9e5ebd844158..b923d13e2fad 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -32,6 +32,8 @@ enum { static int trace_type __read_mostly; +static int save_lat_flag; + #ifdef CONFIG_PREEMPT_TRACER static inline int preempt_trace(void) @@ -370,6 +372,9 @@ static void stop_irqsoff_tracer(struct trace_array *tr) static void __irqsoff_tracer_init(struct trace_array *tr) { + save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; + trace_flags |= TRACE_ITER_LATENCY_FMT; + tracing_max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ @@ -380,6 +385,9 @@ static void __irqsoff_tracer_init(struct trace_array *tr) static void irqsoff_tracer_reset(struct trace_array *tr) { stop_irqsoff_tracer(tr); + + if (!save_lat_flag) + trace_flags &= ~TRACE_ITER_LATENCY_FMT; } static void irqsoff_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index db55f7aaa640..3c5ad6b2ec84 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -32,6 +32,8 @@ static raw_spinlock_t wakeup_lock = static void __wakeup_reset(struct trace_array *tr); +static int save_lat_flag; + #ifdef CONFIG_FUNCTION_TRACER /* * irqsoff uses its own tracer function to keep the overhead down: @@ -324,6 +326,9 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int __wakeup_tracer_init(struct trace_array *tr) { + save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; + trace_flags |= TRACE_ITER_LATENCY_FMT; + tracing_max_latency = 0; wakeup_trace = tr; start_wakeup_tracer(tr); @@ -347,6 +352,9 @@ static void wakeup_tracer_reset(struct trace_array *tr) stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); + + if (!save_lat_flag) + trace_flags &= ~TRACE_ITER_LATENCY_FMT; } static void wakeup_tracer_start(struct trace_array *tr) -- cgit v1.2.3-59-g8ed1b From 5e1607a00bd082972629d3d68c95c8bcf902b55a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 10:24:48 +0100 Subject: tracing: rename ftrace_printk() => trace_printk() Impact: cleanup Use a more generic name - this also allows the prototype to move to kernel.h and be generally available to kernel developers who want to do some quick tracing. Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 6 +++--- include/linux/ftrace.h | 18 +++++++++--------- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace.h | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 2041ee951c1a..22614bef6359 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -1466,11 +1466,11 @@ want, depending on your needs. You can put some comments on specific functions by using -ftrace_printk() For example, if you want to put a comment inside +trace_printk() For example, if you want to put a comment inside the __might_sleep() function, you just have to include - and call ftrace_printk() inside __might_sleep() + and call trace_printk() inside __might_sleep() -ftrace_printk("I'm a comment!\n") +trace_printk("I'm a comment!\n") will produce: diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f69ac7c1587..fbb9c364e166 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -329,11 +329,11 @@ extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); /** - * ftrace_printk - printf formatting in the ftrace buffer + * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing * - * Note: __ftrace_printk is an internal function for ftrace_printk and - * the @ip is passed in via the ftrace_printk macro. + * Note: __trace_printk is an internal function for trace_printk and + * the @ip is passed in via the trace_printk macro. * * This function allows a kernel developer to debug fast path sections * that printk is not appropriate for. By scattering in various @@ -341,14 +341,14 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); * where problems are occurring. * * This is intended as a debugging tool for the developer only. - * Please refrain from leaving ftrace_printks scattered around in + * Please refrain from leaving trace_printks scattered around in * your code. */ -# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt) +# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) extern int -__ftrace_printk(unsigned long ip, const char *fmt, ...) +__trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap) +# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); extern void ftrace_dump(void); @@ -356,13 +356,13 @@ extern void ftrace_dump(void); static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } static inline int -ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); static inline void tracing_start(void) { } static inline void tracing_stop(void) { } static inline void ftrace_off_permanent(void) { } static inline int -ftrace_printk(const char *fmt, ...) +trace_printk(const char *fmt, ...) { return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1ef43999d9e..c0e9c1263393 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -48,7 +48,7 @@ unsigned long __read_mostly tracing_thresh; * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the * entries inserted during the selftest although some concurrent - * insertions into the ring-buffer such as ftrace_printk could occurred + * insertions into the ring-buffer such as trace_printk could occurred * at the same time, giving false positive or negative results. */ static bool __read_mostly tracing_selftest_running; @@ -291,7 +291,7 @@ static const char *trace_options[] = { "block", "stacktrace", "sched-tree", - "ftrace_printk", + "trace_printk", "ftrace_preempt", "branch", "annotate", @@ -3768,7 +3768,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(trace_vprintk); -int __ftrace_printk(unsigned long ip, const char *fmt, ...) +int __trace_printk(unsigned long ip, const char *fmt, ...) { int ret; va_list ap; @@ -3781,7 +3781,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...) va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(__ftrace_printk); +EXPORT_SYMBOL_GPL(__trace_printk); int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 12cd119cca32..8beff03fda68 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -115,7 +115,7 @@ struct userstack_entry { }; /* - * ftrace_printk entry: + * trace_printk entry: */ struct print_entry { struct trace_entry ent; -- cgit v1.2.3-59-g8ed1b From 526211bc58c4b3265352801c5a7f469af5c34711 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 10:28:45 +0100 Subject: tracing: move utility functions from ftrace.h to kernel.h Make common utility functions such as trace_printk() and tracing_start()/tracing_stop() generally available to kernel code. Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 58 ++------------------------------------------------ include/linux/kernel.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 56 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index fbb9c364e166..5b64303ec9f2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -318,62 +318,6 @@ static inline void __ftrace_enabled_restore(int enabled) # define trace_preempt_off(a0, a1) do { } while (0) #endif -#ifdef CONFIG_TRACING -extern int ftrace_dump_on_oops; - -extern void tracing_start(void); -extern void tracing_stop(void); -extern void ftrace_off_permanent(void); - -extern void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); - -/** - * trace_printk - printf formatting in the ftrace buffer - * @fmt: the printf format for printing - * - * Note: __trace_printk is an internal function for trace_printk and - * the @ip is passed in via the trace_printk macro. - * - * This function allows a kernel developer to debug fast path sections - * that printk is not appropriate for. By scattering in various - * printk like tracing in the code, a developer can quickly see - * where problems are occurring. - * - * This is intended as a debugging tool for the developer only. - * Please refrain from leaving trace_printks scattered around in - * your code. - */ -# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) -extern int -__trace_printk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) -extern int -__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); -extern void ftrace_dump(void); -#else -static inline void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } -static inline int -trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); - -static inline void tracing_start(void) { } -static inline void tracing_stop(void) { } -static inline void ftrace_off_permanent(void) { } -static inline int -trace_printk(const char *fmt, ...) -{ - return 0; -} -static inline int -ftrace_vprintk(const char *fmt, va_list ap) -{ - return 0; -} -static inline void ftrace_dump(void) { } -#endif - #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); extern void ftrace_init_module(struct module *mod, @@ -542,6 +486,8 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) return tsk->trace & TSK_TRACE_FL_GRAPH; } +extern int ftrace_dump_on_oops; + #endif /* CONFIG_TRACING */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7fa371898e3e..08bf5da86676 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -367,6 +367,64 @@ static inline char *pack_hex_byte(char *buf, u8 byte) ({ if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); 0; }) #endif +/* + * General tracing related utility functions - trace_printk(), + * tracing_start()/tracing_stop: + */ +#ifdef CONFIG_TRACING +extern void tracing_start(void); +extern void tracing_stop(void); +extern void ftrace_off_permanent(void); + +extern void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); + +/** + * trace_printk - printf formatting in the ftrace buffer + * @fmt: the printf format for printing + * + * Note: __trace_printk is an internal function for trace_printk and + * the @ip is passed in via the trace_printk macro. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving trace_printks scattered around in + * your code. + */ +# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) +extern int +__trace_printk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) +extern int +__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); +extern void ftrace_dump(void); +#else +static inline void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } +static inline int +trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); + +static inline void tracing_start(void) { } +static inline void tracing_stop(void) { } +static inline void ftrace_off_permanent(void) { } +static inline int +trace_printk(const char *fmt, ...) +{ + return 0; +} +static inline int +ftrace_vprintk(const char *fmt, va_list ap) +{ + return 0; +} +static inline void ftrace_dump(void) { } +#endif + /* * Display an IP address in readable format. */ -- cgit v1.2.3-59-g8ed1b From 03d78913f01e8f6599823f00357ed17b32747d3d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Mar 2009 02:29:05 -0800 Subject: lockdep: remove duplicate CONFIG_DEBUG_LOCKDEP definitions Impact: cleanup The atomic debug modifiers are already defined in kernel/lockdep_internals.h. Signed-off-by: David Rientjes Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 02014f7ccc86..9a1e2bcc4b8d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -433,13 +433,6 @@ atomic_t nr_find_usage_forwards_checks; atomic_t nr_find_usage_forwards_recursions; atomic_t nr_find_usage_backwards_checks; atomic_t nr_find_usage_backwards_recursions; -# define debug_atomic_inc(ptr) atomic_inc(ptr) -# define debug_atomic_dec(ptr) atomic_dec(ptr) -# define debug_atomic_read(ptr) atomic_read(ptr) -#else -# define debug_atomic_inc(ptr) do { } while (0) -# define debug_atomic_dec(ptr) do { } while (0) -# define debug_atomic_read(ptr) 0 #endif /* -- cgit v1.2.3-59-g8ed1b From 0012693ad4f636c720fed3802027f9427962f540 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 5 Mar 2009 01:49:22 +0100 Subject: tracing/function-graph-tracer: use the more lightweight local clock Impact: decrease hangs risks with the graph tracer on slow systems Since the function graph tracer can spend too much time on timer interrupts, it's better now to use the more lightweight local clock. Anyway, the function graph traces are more reliable on a per cpu trace. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <49af243d.06e9300a.53ad.ffff840c@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 2 +- include/linux/ftrace.h | 13 +++++++------ kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3925ec0184b1..a85da1764b1c 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -436,7 +436,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = cpu_clock(raw_smp_processor_id()); + calltime = trace_clock_local(); if (ftrace_push_return_trace(old, calltime, self_addr, &trace.depth) == -EBUSY) { diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f69ac7c1587..6ea62acbe4b6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1,15 +1,16 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H -#include -#include -#include -#include -#include -#include +#include #include +#include #include +#include +#include #include +#include +#include +#include #include diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c009553a8e81..e527f2f66c73 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -112,7 +112,7 @@ unsigned long ftrace_return_to_handler(void) unsigned long ret; ftrace_pop_return_trace(&trace, &ret); - trace.rettime = cpu_clock(raw_smp_processor_id()); + trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); if (unlikely(!ret)) { -- cgit v1.2.3-59-g8ed1b From 2002c258faaa8f89543df284fdbaa9e4b171547f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Mar 2009 10:35:56 -0500 Subject: tracing: add tracing_on/tracing_off to kernel.h Impact: cleanup The functions tracing_start/tracing_stop have been moved to kernel.h. These are not the functions a developer most likely wants to use when they want to insert a place to stop tracing and restart it from user space. tracing_start/tracing_stop was created to work with things like suspend to ram, where even calling smp_processor_id() can crash the system. The tracing_start/tracing_stop was used to stop the tracer from doing anything. These are still light weight functions, but add a bit more overhead to be able to stop the tracers. They also have no interface back to userland. That is, if the kernel calls tracing_stop, userland can not start tracing. What a developer most likely wants to use is tracing_on/tracing_off. These are very light weight functions (simply sets or clears a bit). These functions just stop recording into the ring buffer. The tracers don't even know that this happens except that they would receive NULL from the ring_buffer_lock_reserve function. Also, there's a way for the user land to enable or disable this bit. In debugfs/tracing/tracing_on, a user may echo "0" (same as tracing_off()) or echo "1" (same as tracing_on()) into this file. This becomes handy when a kernel developer is debugging and wants tracing to turn off when it hits an anomaly. Then the developer can examine the trace, and restart tracing if they want to try again (echo 1 > tracing_on). This patch moves the prototypes for tracing_on/tracing_off to kernel.h and comments their use, so that a kernel developer will know how to use them. Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 29 ++++++++++++++++++++++++++++- include/linux/ring_buffer.h | 15 --------------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 08bf5da86676..d4614a8a034b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -369,8 +369,35 @@ static inline char *pack_hex_byte(char *buf, u8 byte) /* * General tracing related utility functions - trace_printk(), - * tracing_start()/tracing_stop: + * tracing_on/tracing_off and tracing_start()/tracing_stop + * + * Use tracing_on/tracing_off when you want to quickly turn on or off + * tracing. It simply enables or disables the recording of the trace events. + * This also corresponds to the user space debugfs/tracing/tracing_on + * file, which gives a means for the kernel and userspace to interact. + * Place a tracing_off() in the kernel where you want tracing to end. + * From user space, examine the trace, and then echo 1 > tracing_on + * to continue tracing. + * + * tracing_stop/tracing_start has slightly more overhead. It is used + * by things like suspend to ram where disabling the recording of the + * trace is not enough, but tracing must actually stop because things + * like calling smp_processor_id() may crash the system. + * + * Most likely, you want to use tracing_on/tracing_off. */ +#ifdef CONFIG_RING_BUFFER +void tracing_on(void); +void tracing_off(void); +/* trace_off_permanent stops recording with no way to bring it back */ +void tracing_off_permanent(void); +int tracing_is_on(void); +#else +static inline void tracing_on(void) { } +static inline void tracing_off(void) { } +static inline void tracing_off_permanent(void) { } +static inline int tracing_is_on(void) { return 0; } +#endif #ifdef CONFIG_TRACING extern void tracing_start(void); extern void tracing_stop(void); diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 79fcbc4b09d6..b1a0068a5557 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -124,21 +124,6 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); size_t ring_buffer_page_len(void *page); -/* - * The below functions are fine to use outside the tracing facility. - */ -#ifdef CONFIG_RING_BUFFER -void tracing_on(void); -void tracing_off(void); -void tracing_off_permanent(void); -int tracing_is_on(void); -#else -static inline void tracing_on(void) { } -static inline void tracing_off(void) { } -static inline void tracing_off_permanent(void) { } -static inline int tracing_is_on(void) { return 0; } -#endif - void *ring_buffer_alloc_read_page(struct ring_buffer *buffer); void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, -- cgit v1.2.3-59-g8ed1b From 40ada30f9621fbd831ac2437b9a2a399aad34b00 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 21:19:55 +0100 Subject: tracing: clean up menu Clean up menu structure, introduce TRACING_SUPPORT switch that signals whether an architecture supports various instrumentation mechanisms. Signed-off-by: Ingo Molnar --- arch/Kconfig | 1 + kernel/trace/Kconfig | 29 ++++++++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 550dab22daa1..a092dc77c24d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -6,6 +6,7 @@ config OPROFILE tristate "OProfile system profiling (EXPERIMENTAL)" depends on PROFILING depends on HAVE_OPROFILE + depends on TRACING_SUPPORT select TRACING select RING_BUFFER help diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 999c6a2485df..5d733da5345a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -53,12 +53,22 @@ config TRACING select TRACEPOINTS select NOP_TRACER +# +# Minimum requirements an architecture has to meet for us to +# be able to offer generic tracing facilities: +# +config TRACING_SUPPORT + bool + depends on TRACE_IRQFLAGS_SUPPORT + depends on STACKTRACE_SUPPORT + +if TRACING_SUPPORT + menu "Tracers" config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - depends on DEBUG_KERNEL select FRAME_POINTER select KALLSYMS select TRACING @@ -91,7 +101,6 @@ config IRQSOFF_TRACER default n depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME - depends on DEBUG_KERNEL select TRACE_IRQFLAGS select TRACING select TRACER_MAX_TRACE @@ -114,7 +123,6 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT - depends on DEBUG_KERNEL select TRACING select TRACER_MAX_TRACE help @@ -142,7 +150,6 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -152,7 +159,6 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" - depends on DEBUG_KERNEL select TRACING select MARKERS help @@ -161,7 +167,6 @@ config CONTEXT_SWITCH_TRACER config EVENT_TRACER bool "Trace various events in the kernel" - depends on DEBUG_KERNEL select TRACING help This tracer hooks to various trace points in the kernel @@ -170,7 +175,6 @@ config EVENT_TRACER config BOOT_TRACER bool "Trace boot initcalls" - depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER help @@ -188,7 +192,6 @@ config BOOT_TRACER config TRACE_BRANCH_PROFILING bool "Trace likely/unlikely profiler" - depends on DEBUG_KERNEL select TRACING help This tracer profiles all the the likely and unlikely macros @@ -241,7 +244,6 @@ config BRANCH_TRACER config POWER_TRACER bool "Trace power consumption behavior" - depends on DEBUG_KERNEL depends on X86 select TRACING help @@ -253,7 +255,6 @@ config POWER_TRACER config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER - depends on DEBUG_KERNEL select FUNCTION_TRACER select STACKTRACE select KALLSYMS @@ -343,7 +344,6 @@ config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE - depends on DEBUG_KERNEL default y help This option will modify all the calls to ftrace dynamically @@ -369,7 +369,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING && DEBUG_KERNEL + depends on TRACING select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup @@ -379,7 +379,7 @@ config FTRACE_STARTUP_TEST config MMIOTRACE bool "Memory mapped IO tracing" - depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI + depends on HAVE_MMIOTRACE_SUPPORT && PCI select TRACING help Mmiotrace traces Memory Mapped I/O access and is meant for @@ -401,3 +401,6 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. endmenu + +endif # TRACING_SUPPORT + -- cgit v1.2.3-59-g8ed1b From 5e2336a0d47c9661a40cc5ef85135ce1406af6e8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Mar 2009 21:44:55 -0500 Subject: tracing: make all file_operations const Impact: cleanup All file_operations structures should be constant. No one is going to change them. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 +++++----- kernel/trace/ring_buffer.c | 2 +- kernel/trace/trace.c | 24 ++++++++++++------------ kernel/trace/trace_sysprof.c | 2 +- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5a3a06b21eee..d7a06a0d9447 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1869,21 +1869,21 @@ ftrace_notrace_release(struct inode *inode, struct file *file) return ftrace_regex_release(inode, file, 0); } -static struct file_operations ftrace_avail_fops = { +static const struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, .llseek = seq_lseek, .release = ftrace_avail_release, }; -static struct file_operations ftrace_failures_fops = { +static const struct file_operations ftrace_failures_fops = { .open = ftrace_failures_open, .read = seq_read, .llseek = seq_lseek, .release = ftrace_avail_release, }; -static struct file_operations ftrace_filter_fops = { +static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = ftrace_regex_read, .write = ftrace_filter_write, @@ -1891,7 +1891,7 @@ static struct file_operations ftrace_filter_fops = { .release = ftrace_filter_release, }; -static struct file_operations ftrace_notrace_fops = { +static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, .read = ftrace_regex_read, .write = ftrace_notrace_write, @@ -2423,7 +2423,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations ftrace_pid_fops = { +static const struct file_operations ftrace_pid_fops = { .read = ftrace_pid_read, .write = ftrace_pid_write, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f7473645b9c6..178858492a89 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2606,7 +2606,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations rb_simple_fops = { +static const struct file_operations rb_simple_fops = { .open = tracing_open_generic, .read = rb_simple_read, .write = rb_simple_write, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c0e9c1263393..e6144acf2b75 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1882,14 +1882,14 @@ static int show_traces_open(struct inode *inode, struct file *file) return ret; } -static struct file_operations tracing_fops = { +static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .llseek = seq_lseek, .release = tracing_release, }; -static struct file_operations show_traces_fops = { +static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .release = seq_release, @@ -1982,7 +1982,7 @@ err_unlock: return err; } -static struct file_operations tracing_cpumask_fops = { +static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic, .read = tracing_cpumask_read, .write = tracing_cpumask_write, @@ -2134,7 +2134,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations tracing_iter_fops = { +static const struct file_operations tracing_iter_fops = { .open = tracing_open_generic, .read = tracing_trace_options_read, .write = tracing_trace_options_write, @@ -2167,7 +2167,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, readme_msg, strlen(readme_msg)); } -static struct file_operations tracing_readme_fops = { +static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, }; @@ -2927,25 +2927,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations tracing_max_lat_fops = { +static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, }; -static struct file_operations tracing_ctrl_fops = { +static const struct file_operations tracing_ctrl_fops = { .open = tracing_open_generic, .read = tracing_ctrl_read, .write = tracing_ctrl_write, }; -static struct file_operations set_tracer_fops = { +static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, .write = tracing_set_trace_write, }; -static struct file_operations tracing_pipe_fops = { +static const struct file_operations tracing_pipe_fops = { .open = tracing_open_pipe, .poll = tracing_poll_pipe, .read = tracing_read_pipe, @@ -2953,13 +2953,13 @@ static struct file_operations tracing_pipe_fops = { .release = tracing_release_pipe, }; -static struct file_operations tracing_entries_fops = { +static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic, .read = tracing_entries_read, .write = tracing_entries_write, }; -static struct file_operations tracing_mark_fops = { +static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, }; @@ -3240,7 +3240,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, return r; } -static struct file_operations tracing_dyn_info_fops = { +static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, }; diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index c771af4e8f1a..91fd19c2149f 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -314,7 +314,7 @@ sysprof_sample_write(struct file *filp, const char __user *ubuf, return cnt; } -static struct file_operations sysprof_sample_fops = { +static const struct file_operations sysprof_sample_fops = { .read = sysprof_sample_read, .write = sysprof_sample_write, }; -- cgit v1.2.3-59-g8ed1b From 33b0c229e3abeae00493ed1d6f0b07191977a0a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Mar 2009 11:45:43 -0500 Subject: tracing: move print of event format to separate file Impact: clean up Move the macro that creates the event format file to a separate header. This will allow the default ftrace events to use this same macro to create the formats to read those events. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_stage_2.h | 53 +---------------------------------- kernel/trace/trace_format.h | 55 +++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 52 deletions(-) create mode 100644 kernel/trace/trace_format.h diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index b1cebba1d9b4..d24a97e74aea 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -75,56 +75,5 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #include -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " size:%d; offset:%d;\n", - * sizeof(field.type), - * offsetof(struct ftrace_raw_##call, - * item)); - * - * } - */ - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%lu;\tsize:%lu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (!ret) \ - return 0; - - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%lu;\tsize:%lu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} - +#include "trace_format.h" #include diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h new file mode 100644 index 000000000000..53a6b1357116 --- /dev/null +++ b/kernel/trace/trace_format.h @@ -0,0 +1,55 @@ +/* + * Setup the showing format of trace point. + * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " size:%d; offset:%d;\n", + * sizeof(field.type), + * offsetof(struct ftrace_raw_##call, + * item)); + * + * } + */ + +#undef TRACE_STRUCT +#define TRACE_STRUCT(args...) args + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%lu;\tsize:%lu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; + + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ + "offset:%lu;\tsize:%lu;\n", \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ + return ret; \ +} + -- cgit v1.2.3-59-g8ed1b From 770cb24345c0f6e0d47bd2b94aa6d67bea6f8b54 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 5 Mar 2009 21:35:29 -0500 Subject: tracing: add format files for ftrace default entries Impact: allow user apps to read binary format of basic ftrace entries Currently, only defined raw events export their formats so a binary reader can parse them. There's no reason that the default ftrace entries can't export their formats. This patch adds a subsystem called "ftrace" in the events directory that includes the ftrace entries for basic ftrace recorded items. These only have three files in the events directory: type : printf available_types : printf format : format for the event entry For example: # cat /debug/tracing/events/ftrace/wakeup/format name: wakeup ID: 3 format: field:unsigned char type; offset:0; size:1; field:unsigned char flags; offset:1; size:1; field:unsigned char preempt_count; offset:2; size:1; field:int pid; offset:4; size:4; field:int tgid; offset:8; size:4; field:unsigned int prev_pid; offset:12; size:4; field:unsigned char prev_prio; offset:16; size:1; field:unsigned char prev_state; offset:17; size:1; field:unsigned int next_pid; offset:20; size:4; field:unsigned char next_prio; offset:24; size:1; field:unsigned char next_state; offset:25; size:1; field:unsigned int next_cpu; offset:28; size:4; print fmt: "%u:%u:%u ==+ %u:%u:%u [%03u]" Signed-off-by: Steven Rostedt --- kernel/trace/Makefile | 1 + kernel/trace/trace_event_types.h | 165 +++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.c | 12 +-- kernel/trace/trace_export.c | 81 +++++++++++++++++++ kernel/trace/trace_format.h | 2 +- 5 files changed, 255 insertions(+), 6 deletions(-) create mode 100644 kernel/trace/trace_event_types.h create mode 100644 kernel/trace/trace_export.c diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c931fe0560cb..f44736c7574a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,5 +41,6 @@ obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACER) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o +obj-$(CONFIG_EVENT_TRACER) += trace_export.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h new file mode 100644 index 000000000000..fb4eba166433 --- /dev/null +++ b/kernel/trace/trace_event_types.h @@ -0,0 +1,165 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace + +/* + * We cheat and use the proto type field as the ID + * and args as the entry type (minus 'struct') + */ +TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(unsigned long, parent_ip, parent_ip) + ), + TPRAWFMT(" %lx <-- %lx") +); + +TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, + ftrace_graph_ent_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, graph_ent.func, func) + TRACE_FIELD(int, graph_ent.depth, depth) + ), + TPRAWFMT("--> %lx (%d)") +); + +TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, + ftrace_graph_ret_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ret.func, func) + TRACE_FIELD(int, ret.depth, depth) + ), + TPRAWFMT("<-- %lx (%d)") +); + +TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned int, prev_pid, prev_pid) + TRACE_FIELD(unsigned char, prev_prio, prev_prio) + TRACE_FIELD(unsigned char, prev_state, prev_state) + TRACE_FIELD(unsigned int, next_pid, next_pid) + TRACE_FIELD(unsigned char, next_prio, next_prio) + TRACE_FIELD(unsigned char, next_state, next_state) + TRACE_FIELD(unsigned int, next_cpu, next_cpu) + ), + TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") +); + +TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned int, prev_pid, prev_pid) + TRACE_FIELD(unsigned char, prev_prio, prev_prio) + TRACE_FIELD(unsigned char, prev_state, prev_state) + TRACE_FIELD(unsigned int, next_pid, next_pid) + TRACE_FIELD(unsigned char, next_prio, next_prio) + TRACE_FIELD(unsigned char, next_state, next_state) + TRACE_FIELD(unsigned int, next_cpu, next_cpu) + ), + TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") +); + +TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, arg1, arg1) + TRACE_FIELD(unsigned long, arg2, arg2) + TRACE_FIELD(unsigned long, arg3, arg3) + ), + TPRAWFMT("(%08lx) (%08lx) (%08lx)") +); + +/* + * Stack-trace entry: + */ + +/* #define FTRACE_STACK_ENTRIES 8 */ + +TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, caller[0], stack0) + TRACE_FIELD(unsigned long, caller[1], stack1) + TRACE_FIELD(unsigned long, caller[2], stack2) + TRACE_FIELD(unsigned long, caller[3], stack3) + TRACE_FIELD(unsigned long, caller[4], stack4) + TRACE_FIELD(unsigned long, caller[5], stack5) + TRACE_FIELD(unsigned long, caller[6], stack6) + TRACE_FIELD(unsigned long, caller[7], stack7) + ), + TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") +); + +TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, caller[0], stack0) + TRACE_FIELD(unsigned long, caller[1], stack1) + TRACE_FIELD(unsigned long, caller[2], stack2) + TRACE_FIELD(unsigned long, caller[3], stack3) + TRACE_FIELD(unsigned long, caller[4], stack4) + TRACE_FIELD(unsigned long, caller[5], stack5) + TRACE_FIELD(unsigned long, caller[6], stack6) + TRACE_FIELD(unsigned long, caller[7], stack7) + ), + TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") +); + +TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD_ZERO_CHAR(buf) + ), + TPRAWFMT("%08lx (%d) %s") +); + +TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned int, line, line) + TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) + TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) + TRACE_FIELD(char, correct, correct) + ), + TPRAWFMT("%u:%s:%s (%u)") +); + +TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(u64, from, from) + TRACE_FIELD(u64, to, to) + ), + TPRAWFMT("from: %llx to: %llx") +); + +TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, + TRACE_STRUCT( + TRACE_FIELD(ktime_t, state_data.stamp, stamp) + TRACE_FIELD(ktime_t, state_data.end, end) + TRACE_FIELD(int, state_data.type, type) + TRACE_FIELD(int, state_data.state, state) + ), + TPRAWFMT("%llx->%llx type:%u state:%u") +); + +TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) + TRACE_FIELD(unsigned long, call_site, call_site) + TRACE_FIELD(const void *, ptr, ptr) + TRACE_FIELD(size_t, bytes_req, bytes_req) + TRACE_FIELD(size_t, bytes_alloc, bytes_alloc) + TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) + TRACE_FIELD(int, node, node) + ), + TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" + " flags:%x node:%d") +); + +TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) + TRACE_FIELD(unsigned long, call_site, call_site) + TRACE_FIELD(const void *, ptr, ptr) + ), + TPRAWFMT("type:%u call_site:%lx ptr:%p") +); + +#undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 210e71ff82db..4488d90e75ef 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -656,11 +656,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return -1; } - entry = debugfs_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/enable' entry\n", call->name); + if (call->regfunc) { + entry = debugfs_create_file("enable", 0644, call->dir, call, + &ftrace_enable_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/enable' entry\n", call->name); + } /* Only let type be writable, if we can change it */ entry = debugfs_create_file("type", diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c new file mode 100644 index 000000000000..0fb7be73e31c --- /dev/null +++ b/kernel/trace/trace_export.c @@ -0,0 +1,81 @@ +/* + * trace_export.c - export basic ftrace utilities to user space + * + * Copyright (C) 2009 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace_output.h" + +#include "trace_format.h" + +#undef TRACE_FIELD_ZERO_CHAR +#define TRACE_FIELD_ZERO_CHAR(item) \ + ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \ + "offset:%lu;\tsize:0;\n", \ + offsetof(typeof(field), item)); \ + if (!ret) \ + return 0; + + +#undef TPRAWFMT +#define TPRAWFMT(args...) args + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct args field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ + return ret; \ +} + +#include "trace_event_types.h" + +#undef TRACE_ZERO_CHAR +#define TRACE_ZERO_CHAR(arg) + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign)\ + entry->item = assign; + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign)\ + entry->item = assign; + +#undef TPCMD +#define TPCMD(cmd...) cmd + +#undef TRACE_ENTRY +#define TRACE_ENTRY entry + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + cmd; + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ + \ +static struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = proto, \ + .system = __stringify(TRACE_SYSTEM), \ + .show_format = ftrace_format_##call, \ +} +#include "trace_event_types.h" diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h index 53a6b1357116..03f9a4c165ca 100644 --- a/kernel/trace/trace_format.h +++ b/kernel/trace/trace_format.h @@ -40,7 +40,7 @@ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -int \ +static int \ ftrace_format_##call(struct trace_seq *s) \ { \ struct ftrace_raw_##call field; \ -- cgit v1.2.3-59-g8ed1b From 422d3c7a577b15e1384c9d4e72a9540896b685fa Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 6 Mar 2009 10:40:53 +0900 Subject: tracing: current tip/master can't enable ftrace After commit 40ada30f9621fbd831ac2437b9a2a399aad34b00, "make menuconfig" doesn't display "Tracer" item. Following modification restores it. --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d733da5345a..058d949a3214 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -61,6 +61,7 @@ config TRACING_SUPPORT bool depends on TRACE_IRQFLAGS_SUPPORT depends on STACKTRACE_SUPPORT + default y if TRACING_SUPPORT -- cgit v1.2.3-59-g8ed1b From 10dd3ebe213c31bff14b4dae3c5d32a76b1fad7c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 6 Mar 2009 15:29:04 +0900 Subject: tracing: fix deadlock when setting set_ftrace_pid Impact: fix deadlock while using set_ftrace_pid Reproducer: # cd /sys/kernel/debug/tracing # echo $$ > set_ftrace_pid then, console becomes hung. Details: when writing set_ftracepid, kernel callstack is following ftrace_pid_write() mutex_lock(&ftrace_lock); ftrace_update_pid_func() mutex_lock(&ftrace_lock); mutex_unlock(&ftrace_lock); mutex_unlock(&ftrace_lock); then, system always deadlocks when ftrace_pid_write() is called. In past days, ftrace_pid_write() used ftrace_start_lock, but commit e6ea44e9b4c12325337cd1c06103cd515a1c02b2 consolidated ftrace_start_lock to ftrace_lock. Signed-off-by: KOSAKI Motohiro Reviewed-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <20090306151155.0778.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d7a06a0d9447..d33d306bdcf4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -218,10 +218,8 @@ static void ftrace_update_pid_func(void) { ftrace_func_t func; - mutex_lock(&ftrace_lock); - if (ftrace_trace_function == ftrace_stub) - goto out; + return; func = ftrace_trace_function; @@ -238,9 +236,6 @@ static void ftrace_update_pid_func(void) #else __ftrace_trace_function = func; #endif - - out: - mutex_unlock(&ftrace_lock); } /* set when tracing only a pid */ -- cgit v1.2.3-59-g8ed1b From af438c0f114b6f731b923b5c07150f6159471502 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Mar 2009 12:47:08 +0100 Subject: tracing, power-trace: make it build even if the power-tracer is turned off Impact: build fix The 'struct power_trace' definition is needed (for the event tracer) even if the power-tracer plugin is turned off in the .config. Cc: Steven Rostedt LKML-Reference: <20090306104106.GF31042@elte.hu> Signed-off-by: Ingo Molnar --- include/trace/power.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/trace/power.h b/include/trace/power.h index 2c733e58e89c..38aca537e497 100644 --- a/include/trace/power.h +++ b/include/trace/power.h @@ -11,12 +11,10 @@ enum { }; struct power_trace { -#ifdef CONFIG_POWER_TRACER ktime_t stamp; ktime_t end; int type; int state; -#endif }; DECLARE_TRACE(power_start, -- cgit v1.2.3-59-g8ed1b From 0e39ac444636ff5be39b26f1cb56d79594654dda Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 6 Mar 2009 10:35:52 -0500 Subject: tracing, Text Edit Lock - Architecture Independent Code This is an architecture independant synchronization around kernel text modifications through use of a global mutex. A mutex has been chosen so that kprobes, the main user of this, can sleep during memory allocation between the memory read of the instructions it must replace and the memory write of the breakpoint. Other user of this interface: immediate values. Paravirt and alternatives are always done when SMP is inactive, so there is no need to use locks. Signed-off-by: Mathieu Desnoyers LKML-Reference: <49B142D8.7020601@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/memory.h | 6 ++++++ mm/memory.c | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/memory.h b/include/linux/memory.h index 3fdc10806d31..86a6c0f0518d 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -99,4 +99,10 @@ enum mem_add_context { BOOT, HOTPLUG }; #define hotplug_memory_notifier(fn, pri) do { } while (0) #endif +/* + * Kernel text modification mutex, used for code patching. Users of this lock + * can sleep. + */ +extern struct mutex text_mutex; + #endif /* _LINUX_MEMORY_H_ */ diff --git a/mm/memory.c b/mm/memory.c index baa999e87cd2..05fab3bc5b4b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #include #include #include @@ -99,6 +101,14 @@ int randomize_va_space __read_mostly = 2; #endif +/* + * mutex protecting text section modification (dynamic code patching). + * some users need to sleep (allocating memory...) while they hold this lock. + * + * NOT exported to modules - patching kernel text is a really delicate matter. + */ +DEFINE_MUTEX(text_mutex); + static int __init disable_randmaps(char *s) { randomize_va_space = 0; -- cgit v1.2.3-59-g8ed1b From 4460fdad85becd569f11501ad5b91814814335ff Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 6 Mar 2009 10:36:38 -0500 Subject: tracing, Text Edit Lock - kprobes architecture independent support Use the mutual exclusion provided by the text edit lock in the kprobes code. It allows coherent manipulation of the kernel code by other subsystems. Changelog: Move the kernel_text_lock/unlock out of the for loops. Use text_mutex directly instead of a function. Remove whitespace modifications. (note : kprobes_mutex is always taken outside of text_mutex) Signed-off-by: Mathieu Desnoyers Acked-by: Ananth N Mavinakayanahalli Acked-by: Masami Hiramatsu LKML-Reference: <49B14306.2080202@redhat.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7ba8cd9845cb..479d4d5672f9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -699,9 +700,10 @@ int __kprobes register_kprobe(struct kprobe *p) goto out; } + mutex_lock(&text_mutex); ret = arch_prepare_kprobe(p); if (ret) - goto out; + goto out_unlock_text; INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, @@ -710,6 +712,8 @@ int __kprobes register_kprobe(struct kprobe *p) if (kprobe_enabled) arch_arm_kprobe(p); +out_unlock_text: + mutex_unlock(&text_mutex); out: mutex_unlock(&kprobe_mutex); @@ -746,8 +750,11 @@ valid_p: * enabled and not gone - otherwise, the breakpoint would * already have been removed. We save on flushing icache. */ - if (kprobe_enabled && !kprobe_gone(old_p)) + if (kprobe_enabled && !kprobe_gone(old_p)) { + mutex_lock(&text_mutex); arch_disarm_kprobe(p); + mutex_unlock(&text_mutex); + } hlist_del_rcu(&old_p->hlist); } else { if (p->break_handler && !kprobe_gone(p)) @@ -1280,12 +1287,14 @@ static void __kprobes enable_all_kprobes(void) if (kprobe_enabled) goto already_enabled; + mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_gone(p)) arch_arm_kprobe(p); } + mutex_unlock(&text_mutex); kprobe_enabled = true; printk(KERN_INFO "Kprobes globally enabled\n"); @@ -1310,6 +1319,7 @@ static void __kprobes disable_all_kprobes(void) kprobe_enabled = false; printk(KERN_INFO "Kprobes globally disabled\n"); + mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { @@ -1318,6 +1328,7 @@ static void __kprobes disable_all_kprobes(void) } } + mutex_unlock(&text_mutex); mutex_unlock(&kprobe_mutex); /* Allow all currently running kprobes to complete */ synchronize_sched(); -- cgit v1.2.3-59-g8ed1b From 3945dab45aa8c89014893bfa8eb1e1661a409cef Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Mar 2009 10:37:22 -0500 Subject: tracing, Text Edit Lock - SMP alternatives support Use the mutual exclusion provided by the text edit lock in alternatives code. Since alternative_smp_* will be called from module init code, etc, we'd better protect it from other subsystems. Signed-off-by: Masami Hiramatsu LKML-Reference: <49B14332.9030109@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4c80f1557433..092a7b8be68d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -226,6 +227,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) { u8 **ptr; + mutex_lock(&text_mutex); for (ptr = start; ptr < end; ptr++) { if (*ptr < text) continue; @@ -234,6 +236,7 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) /* turn DS segment override prefix into lock prefix */ text_poke(*ptr, ((unsigned char []){0xf0}), 1); }; + mutex_unlock(&text_mutex); } static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) @@ -243,6 +246,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end if (noreplace_smp) return; + mutex_lock(&text_mutex); for (ptr = start; ptr < end; ptr++) { if (*ptr < text) continue; @@ -251,6 +255,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end /* turn lock prefix into DS segment override prefix */ text_poke(*ptr, ((unsigned char []){0x3E}), 1); }; + mutex_unlock(&text_mutex); } struct smp_alt_module { -- cgit v1.2.3-59-g8ed1b From 78ff7fae04554b49d29226ed12536268c2500d1f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 6 Mar 2009 10:37:54 -0500 Subject: x86: implement atomic text_poke() via fixmap Use fixmaps instead of vmap/vunmap in text_poke() for avoiding page allocation and delayed unmapping. At the result of above change, text_poke() becomes atomic and can be called from stop_machine() etc. Signed-off-by: Masami Hiramatsu Acked-by: Mathieu Desnoyers LKML-Reference: <49B14352.2040705@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 2 ++ arch/x86/kernel/alternative.c | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 63a79c77d220..81937a5dc77c 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -111,6 +111,8 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif + FIX_TEXT_POKE0, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE1, __end_of_permanent_fixed_addresses, #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT FIX_OHCI1394_BASE, diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 092a7b8be68d..2d903b760ddb 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -13,7 +13,9 @@ #include #include #include +#include #include +#include #define MAX_PATCH_LEN (255-1) @@ -505,15 +507,16 @@ void *text_poke_early(void *addr, const void *opcode, size_t len) * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. + * + * Note: Must be called under text_mutex. */ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) { + unsigned long flags; char *vaddr; - int nr_pages = 2; struct page *pages[2]; int i; - might_sleep(); if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); pages[1] = vmalloc_to_page(addr + PAGE_SIZE); @@ -523,14 +526,17 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) pages[1] = virt_to_page(addr + PAGE_SIZE); } BUG_ON(!pages[0]); - if (!pages[1]) - nr_pages = 1; - vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - BUG_ON(!vaddr); - local_irq_disable(); + set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); + if (pages[1]) + set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); + vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); + local_irq_save(flags); memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - local_irq_enable(); - vunmap(vaddr); + local_irq_restore(flags); + clear_fixmap(FIX_TEXT_POKE0); + if (pages[1]) + clear_fixmap(FIX_TEXT_POKE1); + local_flush_tlb(); sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ -- cgit v1.2.3-59-g8ed1b From 4370aa4aa75391a5e2e06bccb0919109f725ed8e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:46 +0100 Subject: vsprintf: add binary printf Impact: add new APIs for binary trace printk infrastructure vbin_printf(): write args to binary buffer, string is copied when "%s" is occurred. bstr_printf(): read from binary buffer for args and format a string [fweisbec@gmail.com: rebase] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds LKML-Reference: <1236356510-8381-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/string.h | 7 + lib/Kconfig | 3 + lib/vsprintf.c | 442 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 452 insertions(+) diff --git a/include/linux/string.h b/include/linux/string.h index d18fc198aa2f..27ac31784ad2 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -10,6 +10,7 @@ #include /* for inline */ #include /* for size_t */ #include /* for NULL */ +#include extern char *strndup_user(const char __user *, long); @@ -111,6 +112,12 @@ extern void argv_free(char **argv); extern bool sysfs_streq(const char *s1, const char *s2); +#ifdef CONFIG_BINARY_PRINTF +int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); +int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf); +int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); +#endif + extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); diff --git a/lib/Kconfig b/lib/Kconfig index 03c2c24b9083..97d62cf091a7 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -2,6 +2,9 @@ # Library configuration # +config BINARY_PRINTF + def_bool n + menu "Library routines" config BITREVERSE diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0fbd0121d91d..3543bbe8b1bc 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1058,6 +1058,448 @@ int sprintf(char * buf, const char *fmt, ...) } EXPORT_SYMBOL(sprintf); +#ifdef CONFIG_BINARY_PRINTF +/* + * bprintf service: + * vbin_printf() - VA arguments to binary data + * bstr_printf() - Binary data to text string + */ + +/** + * vbin_printf - Parse a format string and place args' binary value in a buffer + * @bin_buf: The buffer to place args' binary value + * @size: The size of the buffer(by words(32bits), not characters) + * @fmt: The format string to use + * @args: Arguments for the format string + * + * The format follows C99 vsnprintf, except %n is ignored, and its argument + * is skiped. + * + * The return value is the number of words(32bits) which would be generated for + * the given input. + * + * NOTE: + * If the return value is greater than @size, the resulting bin_buf is NOT + * valid for bstr_printf(). + */ +int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) +{ + char *str, *end; + int qualifier; + + str = (char *)bin_buf; + end = (char *)(bin_buf + size); + +#define save_arg(type) \ +do { \ + if (sizeof(type) == 8) { \ + unsigned long long value; \ + str = PTR_ALIGN(str, sizeof(u32)); \ + value = va_arg(args, unsigned long long); \ + if (str + sizeof(type) <= end) { \ + *(u32 *)str = *(u32 *)&value; \ + *(u32 *)(str + 4) = *((u32 *)&value + 1); \ + } \ + } else { \ + unsigned long value; \ + str = PTR_ALIGN(str, sizeof(type)); \ + value = va_arg(args, int); \ + if (str + sizeof(type) <= end) \ + *(typeof(type) *)str = (type)value; \ + } \ + str += sizeof(type); \ +} while (0) + + for (; *fmt ; ++fmt) { + if (*fmt != '%') + continue; + +repeat: + /* parse flags */ + ++fmt; /* this also skips first '%' */ + if (*fmt == '-' || *fmt == '+' || *fmt == ' ' + || *fmt == '#' || *fmt == '0') + goto repeat; + + /* parse field width */ + if (isdigit(*fmt)) + skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + save_arg(int); + } + + /* parse the precision */ + if (*fmt == '.') { + ++fmt; + if (isdigit(*fmt)) + skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + save_arg(int); + } + } + + /* parse the conversion qualifier */ + qualifier = -1; + if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || + *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { + qualifier = *fmt; + ++fmt; + if (qualifier == 'l' && *fmt == 'l') { + qualifier = 'L'; + ++fmt; + } + } + + /* parse format type */ + switch (*fmt) { + case 'c': + save_arg(char); + continue; + case 's': { + /* save the string argument */ + const char *save_str = va_arg(args, char *); + size_t len; + if ((unsigned long)save_str > (unsigned long)-PAGE_SIZE + || (unsigned long)save_str < PAGE_SIZE) + save_str = ""; + len = strlen(save_str); + if (str + len + 1 < end) + memcpy(str, save_str, len + 1); + str += len + 1; + continue; + } + case 'p': + save_arg(void *); + /* skip all alphanumeric pointer suffixes */ + while (isalnum(fmt[1])) + fmt++; + continue; + case 'n': { + /* skip %n 's argument */ + void *skip_arg; + if (qualifier == 'l') + skip_arg = va_arg(args, long *); + else if (qualifier == 'Z' || qualifier == 'z') + skip_arg = va_arg(args, size_t *); + else + skip_arg = va_arg(args, int *); + continue; + } + case 'o': + case 'x': + case 'X': + case 'd': + case 'i': + case 'u': + /* save arg for case: 'o', 'x', 'X', 'd', 'i', 'u' */ + if (qualifier == 'L') + save_arg(long long); + else if (qualifier == 'l') + save_arg(unsigned long); + else if (qualifier == 'Z' || qualifier == 'z') + save_arg(size_t); + else if (qualifier == 't') + save_arg(ptrdiff_t); + else if (qualifier == 'h') + save_arg(short); + else + save_arg(int); + continue; + default: + if (!*fmt) + --fmt; + continue; + } + } +#undef save_arg + + return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; +} +EXPORT_SYMBOL_GPL(vbin_printf); + +/** + * bstr_printf - Format a string from binary arguments and place it in a buffer + * @buf: The buffer to place the result into + * @size: The size of the buffer, including the trailing null space + * @fmt: The format string to use + * @bin_buf: Binary arguments for the format string + * + * This function like C99 vsnprintf, but the difference is that vsnprintf gets + * arguments from stack, and bstr_printf gets arguments from @bin_buf which is + * a binary buffer that generated by vbin_printf. + * + * The format follows C99 vsnprintf, but has some extensions: + * %pS output the name of a text symbol + * %pF output the name of a function pointer + * %pR output the address range in a struct resource + * %n is ignored + * + * The return value is the number of characters which would + * be generated for the given input, excluding the trailing + * '\0', as per ISO C99. If you want to have the exact + * number of characters written into @buf as return value + * (not including the trailing '\0'), use vscnprintf(). If the + * return is greater than or equal to @size, the resulting + * string is truncated. + */ +int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) +{ + unsigned long long num; + int base; + char *str, *end, c; + const char *args = (const char *)bin_buf; + + int flags; + int field_width; + int precision; + int qualifier; + + if (unlikely((int) size < 0)) { + /* There can be only one.. */ + static char warn = 1; + WARN_ON(warn); + warn = 0; + return 0; + } + + str = buf; + end = buf + size; + +#define get_arg(type) \ +({ \ + typeof(type) value; \ + if (sizeof(type) == 8) { \ + args = PTR_ALIGN(args, sizeof(u32)); \ + *(u32 *)&value = *(u32 *)args; \ + *((u32 *)&value + 1) = *(u32 *)(args + 4); \ + } else { \ + args = PTR_ALIGN(args, sizeof(type)); \ + value = *(typeof(type) *)args; \ + } \ + args += sizeof(type); \ + value; \ +}) + + /* Make sure end is always >= buf */ + if (end < buf) { + end = ((void *)-1); + size = end - buf; + } + + for (; *fmt ; ++fmt) { + if (*fmt != '%') { + if (str < end) + *str = *fmt; + ++str; + continue; + } + + /* process flags */ + flags = 0; +repeat: + ++fmt; /* this also skips first '%' */ + switch (*fmt) { + case '-': + flags |= LEFT; + goto repeat; + case '+': + flags |= PLUS; + goto repeat; + case ' ': + flags |= SPACE; + goto repeat; + case '#': + flags |= SPECIAL; + goto repeat; + case '0': + flags |= ZEROPAD; + goto repeat; + } + + /* get field width */ + field_width = -1; + if (isdigit(*fmt)) + field_width = skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + field_width = get_arg(int); + if (field_width < 0) { + field_width = -field_width; + flags |= LEFT; + } + } + + /* get the precision */ + precision = -1; + if (*fmt == '.') { + ++fmt; + if (isdigit(*fmt)) + precision = skip_atoi(&fmt); + else if (*fmt == '*') { + ++fmt; + /* it's the next argument */ + precision = get_arg(int); + } + if (precision < 0) + precision = 0; + } + + /* get the conversion qualifier */ + qualifier = -1; + if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || + *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { + qualifier = *fmt; + ++fmt; + if (qualifier == 'l' && *fmt == 'l') { + qualifier = 'L'; + ++fmt; + } + } + + /* default base */ + base = 10; + + switch (*fmt) { + case 'c': + if (!(flags & LEFT)) { + while (--field_width > 0) { + if (str < end) + *str = ' '; + ++str; + } + } + c = (unsigned char) get_arg(char); + if (str < end) + *str = c; + ++str; + while (--field_width > 0) { + if (str < end) + *str = ' '; + ++str; + } + continue; + + case 's':{ + const char *str_arg = args; + size_t len = strlen(str_arg); + args += len + 1; + str = string(str, end, (char *)str_arg, field_width, + precision, flags); + continue; + } + + case 'p': + str = pointer(fmt+1, str, end, get_arg(void *), + field_width, precision, flags); + /* Skip all alphanumeric pointer suffixes */ + while (isalnum(fmt[1])) + fmt++; + continue; + + case 'n': + /* skip %n */ + continue; + + case '%': + if (str < end) + *str = '%'; + ++str; + continue; + + /* integer number formats - set up the flags and "break" */ + case 'o': + base = 8; + break; + + case 'x': + flags |= SMALL; + case 'X': + base = 16; + break; + + case 'd': + case 'i': + flags |= SIGN; + case 'u': + break; + + default: + if (str < end) + *str = '%'; + ++str; + if (*fmt) { + if (str < end) + *str = *fmt; + ++str; + } else { + --fmt; + } + continue; + } + if (qualifier == 'L') + num = get_arg(long long); + else if (qualifier == 'l') { + num = get_arg(unsigned long); + if (flags & SIGN) + num = (signed long) num; + } else if (qualifier == 'Z' || qualifier == 'z') { + num = get_arg(size_t); + } else if (qualifier == 't') { + num = get_arg(ptrdiff_t); + } else if (qualifier == 'h') { + num = (unsigned short) get_arg(short); + if (flags & SIGN) + num = (signed short) num; + } else { + num = get_arg(unsigned int); + if (flags & SIGN) + num = (signed int) num; + } + str = number(str, end, num, base, + field_width, precision, flags); + } + if (size > 0) { + if (str < end) + *str = '\0'; + else + end[-1] = '\0'; + } +#undef get_arg + + /* the trailing null byte doesn't count towards the total */ + return str - buf; +} +EXPORT_SYMBOL_GPL(bstr_printf); + +/** + * bprintf - Parse a format string and place args' binary value in a buffer + * @bin_buf: The buffer to place args' binary value + * @size: The size of the buffer(by words(32bits), not characters) + * @fmt: The format string to use + * @...: Arguments for the format string + * + * The function returns the number of words(u32) written + * into @bin_buf. + */ +int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = vbin_printf(bin_buf, size, fmt, args); + va_end(args); + return ret; +} +EXPORT_SYMBOL_GPL(bprintf); + +#endif /* CONFIG_BINARY_PRINTF */ + /** * vsscanf - Unformat a buffer into a list of arguments * @buf: input buffer -- cgit v1.2.3-59-g8ed1b From fef20d9c1380f04ba9492d6463148db07b413708 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Mar 2009 17:21:50 +0100 Subject: vsprintf: unify the format decoding layer for its 3 users An new optimization is making its way to ftrace. Its purpose is to make trace_printk() consuming less memory and be faster. Written by Lai Jiangshan, the approach is to delay the formatting job from tracing time to output time. Currently, a call to trace_printk() will format the whole string and insert it into the ring buffer. Then you can read it on /debug/tracing/trace file. The new implementation stores the address of the format string and the binary parameters into the ring buffer, making the packet more compact and faster to insert. Later, when the user exports the traces, the format string is retrieved with the binary parameters and the formatting job is eventually done. The new implementation rewrites a lot of format decoding bits from vsnprintf() function, making now 3 differents functions to maintain in their duplicated parts of printf format decoding bits. Suggested by Ingo Molnar, this patch tries to factorize the most possible common bits from these functions. The real common part between them is the format decoding. Although they do somewhat similar jobs, their way to export or import the parameters is very different. Thus, only the decoding layer is extracted, unless you see other parts that could be worth factorized. Changes in V2: - Address a suggestion from Linus to group the format_decode() parameters inside a structure. Changes in v3: - Address other cleanups suggested by Ingo and Linus such as passing the printf_spec struct to the format helpers: pointer()/number()/string() Note that this struct is passed by copy and not by address. This is to avoid side effects because these functions often change these values and the changes shoudn't be persistant when a callee helper returns. It would be too risky. - Various cleanups (code alignement, switch/case instead of if/else fountains). - Fix a bug that printed the first format specifier following a %p Changes in v4: - drop unapropriate const qualifier loss while casting fmt to a char * (thanks to Vegard Nossum for having pointed this out). Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-6-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- lib/vsprintf.c | 972 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 541 insertions(+), 431 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 3543bbe8b1bc..25f01578c856 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -396,7 +396,38 @@ static noinline char* put_dec(char *buf, unsigned long long num) #define SMALL 32 /* Must be 32 == 0x20 */ #define SPECIAL 64 /* 0x */ -static char *number(char *buf, char *end, unsigned long long num, int base, int size, int precision, int type) +enum format_type { + FORMAT_TYPE_NONE, /* Just a string part */ + FORMAT_TYPE_WITDH, + FORMAT_TYPE_PRECISION, + FORMAT_TYPE_CHAR, + FORMAT_TYPE_STR, + FORMAT_TYPE_PTR, + FORMAT_TYPE_PERCENT_CHAR, + FORMAT_TYPE_INVALID, + FORMAT_TYPE_LONG_LONG, + FORMAT_TYPE_ULONG, + FORMAT_TYPE_LONG, + FORMAT_TYPE_USHORT, + FORMAT_TYPE_SHORT, + FORMAT_TYPE_UINT, + FORMAT_TYPE_INT, + FORMAT_TYPE_NRCHARS, + FORMAT_TYPE_SIZE_T, + FORMAT_TYPE_PTRDIFF +}; + +struct printf_spec { + enum format_type type; + int flags; /* flags to number() */ + int field_width; /* width of output field */ + int base; + int precision; /* # of digits/chars */ + int qualifier; +}; + +static char *number(char *buf, char *end, unsigned long long num, + struct printf_spec spec) { /* we are called with base 8, 10 or 16, only, thus don't need "G..." */ static const char digits[16] = "0123456789ABCDEF"; /* "GHIJKLMNOPQRSTUVWXYZ"; */ @@ -404,32 +435,32 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int char tmp[66]; char sign; char locase; - int need_pfx = ((type & SPECIAL) && base != 10); + int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); int i; /* locase = 0 or 0x20. ORing digits or letters with 'locase' * produces same digits or (maybe lowercased) letters */ - locase = (type & SMALL); - if (type & LEFT) - type &= ~ZEROPAD; + locase = (spec.flags & SMALL); + if (spec.flags & LEFT) + spec.flags &= ~ZEROPAD; sign = 0; - if (type & SIGN) { + if (spec.flags & SIGN) { if ((signed long long) num < 0) { sign = '-'; num = - (signed long long) num; - size--; - } else if (type & PLUS) { + spec.field_width--; + } else if (spec.flags & PLUS) { sign = '+'; - size--; - } else if (type & SPACE) { + spec.field_width--; + } else if (spec.flags & SPACE) { sign = ' '; - size--; + spec.field_width--; } } if (need_pfx) { - size--; - if (base == 16) - size--; + spec.field_width--; + if (spec.base == 16) + spec.field_width--; } /* generate full string in tmp[], in reverse order */ @@ -441,10 +472,10 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int tmp[i++] = (digits[do_div(num,base)] | locase); } while (num != 0); */ - else if (base != 10) { /* 8 or 16 */ - int mask = base - 1; + else if (spec.base != 10) { /* 8 or 16 */ + int mask = spec.base - 1; int shift = 3; - if (base == 16) shift = 4; + if (spec.base == 16) shift = 4; do { tmp[i++] = (digits[((unsigned char)num) & mask] | locase); num >>= shift; @@ -454,12 +485,12 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int } /* printing 100 using %2d gives "100", not "00" */ - if (i > precision) - precision = i; + if (i > spec.precision) + spec.precision = i; /* leading space padding */ - size -= precision; - if (!(type & (ZEROPAD+LEFT))) { - while(--size >= 0) { + spec.field_width -= spec.precision; + if (!(spec.flags & (ZEROPAD+LEFT))) { + while(--spec.field_width >= 0) { if (buf < end) *buf = ' '; ++buf; @@ -476,23 +507,23 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int if (buf < end) *buf = '0'; ++buf; - if (base == 16) { + if (spec.base == 16) { if (buf < end) *buf = ('X' | locase); ++buf; } } /* zero or space padding */ - if (!(type & LEFT)) { - char c = (type & ZEROPAD) ? '0' : ' '; - while (--size >= 0) { + if (!(spec.flags & LEFT)) { + char c = (spec.flags & ZEROPAD) ? '0' : ' '; + while (--spec.field_width >= 0) { if (buf < end) *buf = c; ++buf; } } /* hmm even more zero padding? */ - while (i <= --precision) { + while (i <= --spec.precision) { if (buf < end) *buf = '0'; ++buf; @@ -504,7 +535,7 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int ++buf; } /* trailing space padding */ - while (--size >= 0) { + while (--spec.field_width >= 0) { if (buf < end) *buf = ' '; ++buf; @@ -512,17 +543,17 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int return buf; } -static char *string(char *buf, char *end, char *s, int field_width, int precision, int flags) +static char *string(char *buf, char *end, char *s, struct printf_spec spec) { int len, i; if ((unsigned long)s < PAGE_SIZE) s = ""; - len = strnlen(s, precision); + len = strnlen(s, spec.precision); - if (!(flags & LEFT)) { - while (len < field_width--) { + if (!(spec.flags & LEFT)) { + while (len < spec.field_width--) { if (buf < end) *buf = ' '; ++buf; @@ -533,7 +564,7 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio *buf = *s; ++buf; ++s; } - while (len < field_width--) { + while (len < spec.field_width--) { if (buf < end) *buf = ' '; ++buf; @@ -541,21 +572,24 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio return buf; } -static char *symbol_string(char *buf, char *end, void *ptr, int field_width, int precision, int flags) +static char *symbol_string(char *buf, char *end, void *ptr, + struct printf_spec spec) { unsigned long value = (unsigned long) ptr; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; sprint_symbol(sym, value); - return string(buf, end, sym, field_width, precision, flags); + return string(buf, end, sym, spec); #else - field_width = 2*sizeof(void *); - flags |= SPECIAL | SMALL | ZEROPAD; - return number(buf, end, value, 16, field_width, precision, flags); + spec.field_width = 2*sizeof(void *); + spec.flags |= SPECIAL | SMALL | ZEROPAD; + spec.base = 16; + return number(buf, end, value, spec); #endif } -static char *resource_string(char *buf, char *end, struct resource *res, int field_width, int precision, int flags) +static char *resource_string(char *buf, char *end, struct resource *res, + struct printf_spec spec) { #ifndef IO_RSRC_PRINTK_SIZE #define IO_RSRC_PRINTK_SIZE 4 @@ -564,7 +598,11 @@ static char *resource_string(char *buf, char *end, struct resource *res, int fie #ifndef MEM_RSRC_PRINTK_SIZE #define MEM_RSRC_PRINTK_SIZE 8 #endif - + struct printf_spec num_spec = { + .base = 16, + .precision = -1, + .flags = SPECIAL | SMALL | ZEROPAD, + }; /* room for the actual numbers, the two "0x", -, [, ] and the final zero */ char sym[4*sizeof(resource_size_t) + 8]; char *p = sym, *pend = sym + sizeof(sym); @@ -576,17 +614,18 @@ static char *resource_string(char *buf, char *end, struct resource *res, int fie size = MEM_RSRC_PRINTK_SIZE; *p++ = '['; - p = number(p, pend, res->start, 16, size, -1, SPECIAL | SMALL | ZEROPAD); + num_spec.field_width = size; + p = number(p, pend, res->start, num_spec); *p++ = '-'; - p = number(p, pend, res->end, 16, size, -1, SPECIAL | SMALL | ZEROPAD); + p = number(p, pend, res->end, num_spec); *p++ = ']'; *p = 0; - return string(buf, end, sym, field_width, precision, flags); + return string(buf, end, sym, spec); } -static char *mac_address_string(char *buf, char *end, u8 *addr, int field_width, - int precision, int flags) +static char *mac_address_string(char *buf, char *end, u8 *addr, + struct printf_spec spec) { char mac_addr[6 * 3]; /* (6 * 2 hex digits), 5 colons and trailing zero */ char *p = mac_addr; @@ -594,16 +633,17 @@ static char *mac_address_string(char *buf, char *end, u8 *addr, int field_width, for (i = 0; i < 6; i++) { p = pack_hex_byte(p, addr[i]); - if (!(flags & SPECIAL) && i != 5) + if (!(spec.flags & SPECIAL) && i != 5) *p++ = ':'; } *p = '\0'; + spec.flags &= ~SPECIAL; - return string(buf, end, mac_addr, field_width, precision, flags & ~SPECIAL); + return string(buf, end, mac_addr, spec); } -static char *ip6_addr_string(char *buf, char *end, u8 *addr, int field_width, - int precision, int flags) +static char *ip6_addr_string(char *buf, char *end, u8 *addr, + struct printf_spec spec) { char ip6_addr[8 * 5]; /* (8 * 4 hex digits), 7 colons and trailing zero */ char *p = ip6_addr; @@ -612,16 +652,17 @@ static char *ip6_addr_string(char *buf, char *end, u8 *addr, int field_width, for (i = 0; i < 8; i++) { p = pack_hex_byte(p, addr[2 * i]); p = pack_hex_byte(p, addr[2 * i + 1]); - if (!(flags & SPECIAL) && i != 7) + if (!(spec.flags & SPECIAL) && i != 7) *p++ = ':'; } *p = '\0'; + spec.flags &= ~SPECIAL; - return string(buf, end, ip6_addr, field_width, precision, flags & ~SPECIAL); + return string(buf, end, ip6_addr, spec); } -static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width, - int precision, int flags) +static char *ip4_addr_string(char *buf, char *end, u8 *addr, + struct printf_spec spec) { char ip4_addr[4 * 4]; /* (4 * 3 decimal digits), 3 dots and trailing zero */ char temp[3]; /* hold each IP quad in reverse order */ @@ -637,8 +678,9 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width, *p++ = '.'; } *p = '\0'; + spec.flags &= ~SPECIAL; - return string(buf, end, ip4_addr, field_width, precision, flags & ~SPECIAL); + return string(buf, end, ip4_addr, spec); } /* @@ -663,41 +705,234 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr, int field_width, * function pointers are really function descriptors, which contain a * pointer to the real address. */ -static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field_width, int precision, int flags) +static char *pointer(const char *fmt, char *buf, char *end, void *ptr, + struct printf_spec spec) { if (!ptr) - return string(buf, end, "(null)", field_width, precision, flags); + return string(buf, end, "(null)", spec); switch (*fmt) { case 'F': ptr = dereference_function_descriptor(ptr); /* Fallthrough */ case 'S': - return symbol_string(buf, end, ptr, field_width, precision, flags); + return symbol_string(buf, end, ptr, spec); case 'R': - return resource_string(buf, end, ptr, field_width, precision, flags); + return resource_string(buf, end, ptr, spec); case 'm': - flags |= SPECIAL; + spec.flags |= SPECIAL; /* Fallthrough */ case 'M': - return mac_address_string(buf, end, ptr, field_width, precision, flags); + return mac_address_string(buf, end, ptr, spec); case 'i': - flags |= SPECIAL; + spec.flags |= SPECIAL; /* Fallthrough */ case 'I': if (fmt[1] == '6') - return ip6_addr_string(buf, end, ptr, field_width, precision, flags); + return ip6_addr_string(buf, end, ptr, spec); if (fmt[1] == '4') - return ip4_addr_string(buf, end, ptr, field_width, precision, flags); - flags &= ~SPECIAL; + return ip4_addr_string(buf, end, ptr, spec); + spec.flags &= ~SPECIAL; + break; + } + spec.flags |= SMALL; + if (spec.field_width == -1) { + spec.field_width = 2*sizeof(void *); + spec.flags |= ZEROPAD; + } + spec.base = 16; + + return number(buf, end, (unsigned long) ptr, spec); +} + +/* + * Helper function to decode printf style format. + * Each call decode a token from the format and return the + * number of characters read (or likely the delta where it wants + * to go on the next call). + * The decoded token is returned through the parameters + * + * 'h', 'l', or 'L' for integer fields + * 'z' support added 23/7/1999 S.H. + * 'z' changed to 'Z' --davidm 1/25/99 + * 't' added for ptrdiff_t + * + * @fmt: the format string + * @type of the token returned + * @flags: various flags such as +, -, # tokens.. + * @field_width: overwritten width + * @base: base of the number (octal, hex, ...) + * @precision: precision of a number + * @qualifier: qualifier of a number (long, size_t, ...) + */ +static int format_decode(const char *fmt, struct printf_spec *spec) +{ + const char *start = fmt; + bool sign = false; + + /* we finished early by reading the field width */ + if (spec->type == FORMAT_TYPE_WITDH) { + if (spec->field_width < 0) { + spec->field_width = -spec->field_width; + spec->flags |= LEFT; + } + spec->type = FORMAT_TYPE_NONE; + goto precision; + } + + /* we finished early by reading the precision */ + if (spec->type == FORMAT_TYPE_PRECISION) { + if (spec->precision < 0) + spec->precision = 0; + + spec->type = FORMAT_TYPE_NONE; + goto qualifier; + } + + /* By default */ + spec->type = FORMAT_TYPE_NONE; + + for (; *fmt ; ++fmt) { + if (*fmt == '%') + break; + } + + /* Return the current non-format string */ + if (fmt != start || !*fmt) + return fmt - start; + + /* Process flags */ + spec->flags = 0; + + while (1) { /* this also skips first '%' */ + bool found = true; + + ++fmt; + + switch (*fmt) { + case '-': spec->flags |= LEFT; break; + case '+': spec->flags |= PLUS; break; + case ' ': spec->flags |= SPACE; break; + case '#': spec->flags |= SPECIAL; break; + case '0': spec->flags |= ZEROPAD; break; + default: found = false; + } + + if (!found) + break; + } + + /* get field width */ + spec->field_width = -1; + + if (isdigit(*fmt)) + spec->field_width = skip_atoi(&fmt); + else if (*fmt == '*') { + /* it's the next argument */ + spec->type = FORMAT_TYPE_WITDH; + return ++fmt - start; + } + +precision: + /* get the precision */ + spec->precision = -1; + if (*fmt == '.') { + ++fmt; + if (isdigit(*fmt)) { + spec->precision = skip_atoi(&fmt); + if (spec->precision < 0) + spec->precision = 0; + } else if (*fmt == '*') { + /* it's the next argument */ + spec->type = FORMAT_TYPE_WITDH; + return ++fmt - start; + } + } + +qualifier: + /* get the conversion qualifier */ + spec->qualifier = -1; + if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || + *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { + spec->qualifier = *fmt; + ++fmt; + if (spec->qualifier == 'l' && *fmt == 'l') { + spec->qualifier = 'L'; + ++fmt; + } + } + + /* default base */ + spec->base = 10; + switch (*fmt) { + case 'c': + spec->type = FORMAT_TYPE_CHAR; + return ++fmt - start; + + case 's': + spec->type = FORMAT_TYPE_STR; + return ++fmt - start; + + case 'p': + spec->type = FORMAT_TYPE_PTR; + return fmt - start; + /* skip alnum */ + + case 'n': + spec->type = FORMAT_TYPE_NRCHARS; + return ++fmt - start; + + case '%': + spec->type = FORMAT_TYPE_PERCENT_CHAR; + return ++fmt - start; + + /* integer number formats - set up the flags and "break" */ + case 'o': + spec->base = 8; break; + + case 'x': + spec->flags |= SMALL; + + case 'X': + spec->base = 16; + break; + + case 'd': + case 'i': + sign = true; + case 'u': + break; + + default: + spec->type = FORMAT_TYPE_INVALID; + return fmt - start; } - flags |= SMALL; - if (field_width == -1) { - field_width = 2*sizeof(void *); - flags |= ZEROPAD; + + if (spec->qualifier == 'L') + spec->type = FORMAT_TYPE_LONG_LONG; + else if (spec->qualifier == 'l') { + if (sign) + spec->type = FORMAT_TYPE_LONG; + else + spec->type = FORMAT_TYPE_ULONG; + } else if (spec->qualifier == 'Z' || spec->qualifier == 'z') { + spec->type = FORMAT_TYPE_SIZE_T; + } else if (spec->qualifier == 't') { + spec->type = FORMAT_TYPE_PTRDIFF; + } else if (spec->qualifier == 'h') { + if (sign) + spec->type = FORMAT_TYPE_SHORT; + else + spec->type = FORMAT_TYPE_USHORT; + } else { + if (sign) + spec->type = FORMAT_TYPE_INT; + else + spec->type = FORMAT_TYPE_UINT; } - return number(buf, end, (unsigned long) ptr, 16, field_width, precision, flags); + + return ++fmt - start; } /** @@ -726,18 +961,9 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) { unsigned long long num; - int base; char *str, *end, c; - - int flags; /* flags to number() */ - - int field_width; /* width of output field */ - int precision; /* min. # of digits for integers; max - number of chars for from string */ - int qualifier; /* 'h', 'l', or 'L' for integer fields */ - /* 'z' support added 23/7/1999 S.H. */ - /* 'z' changed to 'Z' --davidm 1/25/99 */ - /* 't' added for ptrdiff_t */ + int read; + struct printf_spec spec = {0}; /* Reject out-of-range values early. Large positive sizes are used for unknown buffer sizes. */ @@ -758,184 +984,144 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) size = end - buf; } - for (; *fmt ; ++fmt) { - if (*fmt != '%') { - if (str < end) - *str = *fmt; - ++str; - continue; - } + while (*fmt) { + const char *old_fmt = fmt; - /* process flags */ - flags = 0; - repeat: - ++fmt; /* this also skips first '%' */ - switch (*fmt) { - case '-': flags |= LEFT; goto repeat; - case '+': flags |= PLUS; goto repeat; - case ' ': flags |= SPACE; goto repeat; - case '#': flags |= SPECIAL; goto repeat; - case '0': flags |= ZEROPAD; goto repeat; - } + read = format_decode(fmt, &spec); - /* get field width */ - field_width = -1; - if (isdigit(*fmt)) - field_width = skip_atoi(&fmt); - else if (*fmt == '*') { - ++fmt; - /* it's the next argument */ - field_width = va_arg(args, int); - if (field_width < 0) { - field_width = -field_width; - flags |= LEFT; - } - } + fmt += read; - /* get the precision */ - precision = -1; - if (*fmt == '.') { - ++fmt; - if (isdigit(*fmt)) - precision = skip_atoi(&fmt); - else if (*fmt == '*') { - ++fmt; - /* it's the next argument */ - precision = va_arg(args, int); + switch (spec.type) { + case FORMAT_TYPE_NONE: { + int copy = read; + if (str < end) { + if (copy > end - str) + copy = end - str; + memcpy(str, old_fmt, copy); } - if (precision < 0) - precision = 0; + str += read; + break; } - /* get the conversion qualifier */ - qualifier = -1; - if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || - *fmt =='Z' || *fmt == 'z' || *fmt == 't') { - qualifier = *fmt; - ++fmt; - if (qualifier == 'l' && *fmt == 'l') { - qualifier = 'L'; - ++fmt; - } - } + case FORMAT_TYPE_WITDH: + spec.field_width = va_arg(args, int); + break; - /* default base */ - base = 10; + case FORMAT_TYPE_PRECISION: + spec.precision = va_arg(args, int); + break; - switch (*fmt) { - case 'c': - if (!(flags & LEFT)) { - while (--field_width > 0) { - if (str < end) - *str = ' '; - ++str; - } - } - c = (unsigned char) va_arg(args, int); - if (str < end) - *str = c; - ++str; - while (--field_width > 0) { + case FORMAT_TYPE_CHAR: + if (!(spec.flags & LEFT)) { + while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; - } - continue; - - case 's': - str = string(str, end, va_arg(args, char *), field_width, precision, flags); - continue; - - case 'p': - str = pointer(fmt+1, str, end, - va_arg(args, void *), - field_width, precision, flags); - /* Skip all alphanumeric pointer suffixes */ - while (isalnum(fmt[1])) - fmt++; - continue; - - case 'n': - /* FIXME: - * What does C99 say about the overflow case here? */ - if (qualifier == 'l') { - long * ip = va_arg(args, long *); - *ip = (str - buf); - } else if (qualifier == 'Z' || qualifier == 'z') { - size_t * ip = va_arg(args, size_t *); - *ip = (str - buf); - } else { - int * ip = va_arg(args, int *); - *ip = (str - buf); - } - continue; - case '%': + } + } + c = (unsigned char) va_arg(args, int); + if (str < end) + *str = c; + ++str; + while (--spec.field_width > 0) { if (str < end) - *str = '%'; + *str = ' '; ++str; - continue; + } + break; - /* integer number formats - set up the flags and "break" */ - case 'o': - base = 8; - break; + case FORMAT_TYPE_STR: + str = string(str, end, va_arg(args, char *), spec); + break; - case 'x': - flags |= SMALL; - case 'X': - base = 16; - break; + case FORMAT_TYPE_PTR: + str = pointer(fmt+1, str, end, va_arg(args, void *), + spec); + while (isalnum(*fmt)) + fmt++; + break; - case 'd': - case 'i': - flags |= SIGN; - case 'u': - break; + case FORMAT_TYPE_PERCENT_CHAR: + if (str < end) + *str = '%'; + ++str; + break; - default: + case FORMAT_TYPE_INVALID: + if (str < end) + *str = '%'; + ++str; + if (*fmt) { if (str < end) - *str = '%'; + *str = *fmt; ++str; - if (*fmt) { - if (str < end) - *str = *fmt; - ++str; - } else { - --fmt; - } - continue; + } else { + --fmt; + } + break; + + case FORMAT_TYPE_NRCHARS: { + int qualifier = spec.qualifier; + + if (qualifier == 'l') { + long *ip = va_arg(args, long *); + *ip = (str - buf); + } else if (qualifier == 'Z' || + qualifier == 'z') { + size_t *ip = va_arg(args, size_t *); + *ip = (str - buf); + } else { + int *ip = va_arg(args, int *); + *ip = (str - buf); + } + break; } - if (qualifier == 'L') - num = va_arg(args, long long); - else if (qualifier == 'l') { - num = va_arg(args, unsigned long); - if (flags & SIGN) - num = (signed long) num; - } else if (qualifier == 'Z' || qualifier == 'z') { - num = va_arg(args, size_t); - } else if (qualifier == 't') { - num = va_arg(args, ptrdiff_t); - } else if (qualifier == 'h') { - num = (unsigned short) va_arg(args, int); - if (flags & SIGN) - num = (signed short) num; - } else { - num = va_arg(args, unsigned int); - if (flags & SIGN) - num = (signed int) num; + + default: + switch (spec.type) { + case FORMAT_TYPE_LONG_LONG: + num = va_arg(args, long long); + break; + case FORMAT_TYPE_ULONG: + num = va_arg(args, unsigned long); + break; + case FORMAT_TYPE_LONG: + num = va_arg(args, long); + break; + case FORMAT_TYPE_SIZE_T: + num = va_arg(args, size_t); + break; + case FORMAT_TYPE_PTRDIFF: + num = va_arg(args, ptrdiff_t); + break; + case FORMAT_TYPE_USHORT: + num = (unsigned short) va_arg(args, int); + break; + case FORMAT_TYPE_SHORT: + num = (short) va_arg(args, int); + break; + case FORMAT_TYPE_UINT: + num = va_arg(args, unsigned int); + break; + default: + num = va_arg(args, unsigned int); + } + + str = number(str, end, num, spec); } - str = number(str, end, num, base, - field_width, precision, flags); } + if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } + /* the trailing null byte doesn't count towards the total */ return str-buf; + } EXPORT_SYMBOL(vsnprintf); @@ -1084,8 +1270,9 @@ EXPORT_SYMBOL(sprintf); */ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args) { + struct printf_spec spec = {0}; char *str, *end; - int qualifier; + int read; str = (char *)bin_buf; end = (char *)(bin_buf + size); @@ -1110,57 +1297,26 @@ do { \ str += sizeof(type); \ } while (0) - for (; *fmt ; ++fmt) { - if (*fmt != '%') - continue; -repeat: - /* parse flags */ - ++fmt; /* this also skips first '%' */ - if (*fmt == '-' || *fmt == '+' || *fmt == ' ' - || *fmt == '#' || *fmt == '0') - goto repeat; + while (*fmt) { + read = format_decode(fmt, &spec); - /* parse field width */ - if (isdigit(*fmt)) - skip_atoi(&fmt); - else if (*fmt == '*') { - ++fmt; - /* it's the next argument */ - save_arg(int); - } + fmt += read; - /* parse the precision */ - if (*fmt == '.') { - ++fmt; - if (isdigit(*fmt)) - skip_atoi(&fmt); - else if (*fmt == '*') { - ++fmt; - /* it's the next argument */ - save_arg(int); - } - } + switch (spec.type) { + case FORMAT_TYPE_NONE: + break; - /* parse the conversion qualifier */ - qualifier = -1; - if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || - *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { - qualifier = *fmt; - ++fmt; - if (qualifier == 'l' && *fmt == 'l') { - qualifier = 'L'; - ++fmt; - } - } + case FORMAT_TYPE_WITDH: + case FORMAT_TYPE_PRECISION: + save_arg(int); + break; - /* parse format type */ - switch (*fmt) { - case 'c': + case FORMAT_TYPE_CHAR: save_arg(char); - continue; - case 's': { - /* save the string argument */ + break; + + case FORMAT_TYPE_STR: { const char *save_str = va_arg(args, char *); size_t len; if ((unsigned long)save_str > (unsigned long)-PAGE_SIZE @@ -1170,16 +1326,27 @@ repeat: if (str + len + 1 < end) memcpy(str, save_str, len + 1); str += len + 1; - continue; + break; } - case 'p': + + case FORMAT_TYPE_PTR: save_arg(void *); /* skip all alphanumeric pointer suffixes */ - while (isalnum(fmt[1])) + while (isalnum(*fmt)) fmt++; - continue; - case 'n': { + break; + + case FORMAT_TYPE_PERCENT_CHAR: + break; + + case FORMAT_TYPE_INVALID: + if (!*fmt) + --fmt; + break; + + case FORMAT_TYPE_NRCHARS: { /* skip %n 's argument */ + int qualifier = spec.qualifier; void *skip_arg; if (qualifier == 'l') skip_arg = va_arg(args, long *); @@ -1187,37 +1354,37 @@ repeat: skip_arg = va_arg(args, size_t *); else skip_arg = va_arg(args, int *); - continue; + break; } - case 'o': - case 'x': - case 'X': - case 'd': - case 'i': - case 'u': - /* save arg for case: 'o', 'x', 'X', 'd', 'i', 'u' */ - if (qualifier == 'L') + + default: + switch (spec.type) { + + case FORMAT_TYPE_LONG_LONG: save_arg(long long); - else if (qualifier == 'l') + break; + case FORMAT_TYPE_ULONG: + case FORMAT_TYPE_LONG: save_arg(unsigned long); - else if (qualifier == 'Z' || qualifier == 'z') + break; + case FORMAT_TYPE_SIZE_T: save_arg(size_t); - else if (qualifier == 't') + break; + case FORMAT_TYPE_PTRDIFF: save_arg(ptrdiff_t); - else if (qualifier == 'h') + break; + case FORMAT_TYPE_USHORT: + case FORMAT_TYPE_SHORT: save_arg(short); - else + break; + default: save_arg(int); - continue; - default: - if (!*fmt) - --fmt; - continue; + } } } -#undef save_arg - return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; + +#undef save_arg } EXPORT_SYMBOL_GPL(vbin_printf); @@ -1249,14 +1416,10 @@ EXPORT_SYMBOL_GPL(vbin_printf); int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) { unsigned long long num; - int base; char *str, *end, c; const char *args = (const char *)bin_buf; - int flags; - int field_width; - int precision; - int qualifier; + struct printf_spec spec = {0}; if (unlikely((int) size < 0)) { /* There can be only one.. */ @@ -1290,84 +1453,37 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) size = end - buf; } - for (; *fmt ; ++fmt) { - if (*fmt != '%') { - if (str < end) - *str = *fmt; - ++str; - continue; - } + while (*fmt) { + int read; + const char *old_fmt = fmt; - /* process flags */ - flags = 0; -repeat: - ++fmt; /* this also skips first '%' */ - switch (*fmt) { - case '-': - flags |= LEFT; - goto repeat; - case '+': - flags |= PLUS; - goto repeat; - case ' ': - flags |= SPACE; - goto repeat; - case '#': - flags |= SPECIAL; - goto repeat; - case '0': - flags |= ZEROPAD; - goto repeat; - } + read = format_decode(fmt, &spec); - /* get field width */ - field_width = -1; - if (isdigit(*fmt)) - field_width = skip_atoi(&fmt); - else if (*fmt == '*') { - ++fmt; - /* it's the next argument */ - field_width = get_arg(int); - if (field_width < 0) { - field_width = -field_width; - flags |= LEFT; - } - } + fmt += read; - /* get the precision */ - precision = -1; - if (*fmt == '.') { - ++fmt; - if (isdigit(*fmt)) - precision = skip_atoi(&fmt); - else if (*fmt == '*') { - ++fmt; - /* it's the next argument */ - precision = get_arg(int); + switch (spec.type) { + case FORMAT_TYPE_NONE: { + int copy = read; + if (str < end) { + if (copy > end - str) + copy = end - str; + memcpy(str, old_fmt, copy); } - if (precision < 0) - precision = 0; + str += read; + break; } - /* get the conversion qualifier */ - qualifier = -1; - if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || - *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { - qualifier = *fmt; - ++fmt; - if (qualifier == 'l' && *fmt == 'l') { - qualifier = 'L'; - ++fmt; - } - } + case FORMAT_TYPE_WITDH: + spec.field_width = get_arg(int); + break; - /* default base */ - base = 10; + case FORMAT_TYPE_PRECISION: + spec.precision = get_arg(int); + break; - switch (*fmt) { - case 'c': - if (!(flags & LEFT)) { - while (--field_width > 0) { + case FORMAT_TYPE_CHAR: + if (!(spec.flags & LEFT)) { + while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; @@ -1377,58 +1493,34 @@ repeat: if (str < end) *str = c; ++str; - while (--field_width > 0) { + while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } - continue; + break; - case 's':{ + case FORMAT_TYPE_STR: { const char *str_arg = args; size_t len = strlen(str_arg); args += len + 1; - str = string(str, end, (char *)str_arg, field_width, - precision, flags); - continue; + str = string(str, end, (char *)str_arg, spec); + break; } - case 'p': - str = pointer(fmt+1, str, end, get_arg(void *), - field_width, precision, flags); - /* Skip all alphanumeric pointer suffixes */ - while (isalnum(fmt[1])) + case FORMAT_TYPE_PTR: + str = pointer(fmt+1, str, end, get_arg(void *), spec); + while (isalnum(*fmt)) fmt++; - continue; - - case 'n': - /* skip %n */ - continue; + break; - case '%': + case FORMAT_TYPE_PERCENT_CHAR: if (str < end) *str = '%'; ++str; - continue; - - /* integer number formats - set up the flags and "break" */ - case 'o': - base = 8; - break; - - case 'x': - flags |= SMALL; - case 'X': - base = 16; break; - case 'd': - case 'i': - flags |= SIGN; - case 'u': - break; - - default: + case FORMAT_TYPE_INVALID: if (str < end) *str = '%'; ++str; @@ -1439,36 +1531,54 @@ repeat: } else { --fmt; } - continue; - } - if (qualifier == 'L') - num = get_arg(long long); - else if (qualifier == 'l') { - num = get_arg(unsigned long); - if (flags & SIGN) - num = (signed long) num; - } else if (qualifier == 'Z' || qualifier == 'z') { - num = get_arg(size_t); - } else if (qualifier == 't') { - num = get_arg(ptrdiff_t); - } else if (qualifier == 'h') { - num = (unsigned short) get_arg(short); - if (flags & SIGN) - num = (signed short) num; - } else { - num = get_arg(unsigned int); - if (flags & SIGN) - num = (signed int) num; + break; + + case FORMAT_TYPE_NRCHARS: + /* skip */ + break; + + default: + switch (spec.type) { + + case FORMAT_TYPE_LONG_LONG: + num = get_arg(long long); + break; + case FORMAT_TYPE_ULONG: + num = get_arg(unsigned long); + break; + case FORMAT_TYPE_LONG: + num = get_arg(unsigned long); + break; + case FORMAT_TYPE_SIZE_T: + num = get_arg(size_t); + break; + case FORMAT_TYPE_PTRDIFF: + num = get_arg(ptrdiff_t); + break; + case FORMAT_TYPE_USHORT: + num = get_arg(unsigned short); + break; + case FORMAT_TYPE_SHORT: + num = get_arg(short); + break; + case FORMAT_TYPE_UINT: + num = get_arg(unsigned int); + break; + default: + num = get_arg(int); + } + + str = number(str, end, num, spec); } - str = number(str, end, num, base, - field_width, precision, flags); } + if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } + #undef get_arg /* the trailing null byte doesn't count towards the total */ -- cgit v1.2.3-59-g8ed1b From 1427cdf0592368bdec57276edaf714040ee8744f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:47 +0100 Subject: tracing: infrastructure for supporting binary record Impact: save on memory for tracing Current tracers are typically using a struct(like struct ftrace_entry, struct ctx_switch_entry, struct special_entr etc...)to record a binary event. These structs can only record a their own kind of events. A new kind of tracer need a new struct and a lot of code too handle it. So we need a generic binary record for events. This infrastructure is for this purpose. [fweisbec@gmail.com: rebase against latest -tip, make it safe while sched tracing as reported by Steven Rostedt] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 3 ++ kernel/trace/Kconfig | 6 +++ kernel/trace/Makefile | 1 + kernel/trace/trace.c | 56 ++++++++++++++++++++++++++++ kernel/trace/trace.h | 12 ++++++ kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.c | 75 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 240 insertions(+) create mode 100644 kernel/trace/trace_bprintk.c diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 498769425eb2..1c9cdca02580 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,6 +223,9 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); +#ifdef CONFIG_TRACE_BPRINTK +extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +#endif /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 058d949a3214..ad8d3617d0a6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -97,6 +97,12 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. +config TRACE_BPRINTK + bool "Binary printk for tracing" + default y + depends on TRACING + select BINARY_PRINTF + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index f44736c7574a..46557ef4c379 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o +obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6144acf2b75..ff53509e19f8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3792,6 +3792,62 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +/** + * trace_vbprintk - write binary msg to tracing buffer + * + * Caller must insure @fmt are valid when msg is in tracing buffer. + */ +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) +{ + static DEFINE_SPINLOCK(trace_buf_lock); + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct bprintk_entry *entry; + unsigned long flags; + int resched; + int cpu, len = 0, size, pc; + + if (tracing_disabled || !trace_bprintk_enable) + return 0; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + spin_lock_irqsave(&trace_buf_lock, flags); + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = sizeof(*entry) + sizeof(u32) * len; + event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + ring_buffer_unlock_commit(tr->buffer, event); + +out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + +out: + ftrace_preempt_enable(resched); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vbprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8beff03fda68..0f5077f8f957 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,6 +20,7 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, + TRACE_BPRINTK, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -124,6 +125,16 @@ struct print_entry { char buf[]; }; +struct bprintk_entry { + struct trace_entry ent; + unsigned long ip; + const char *fmt; + u32 buf[]; +}; +#ifdef CONFIG_TRACE_BPRINTK +extern int trace_bprintk_enable; +#endif + #define TRACE_OLD_SIZE 88 struct trace_field_cont { @@ -285,6 +296,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c new file mode 100644 index 000000000000..1f8e532c3fb9 --- /dev/null +++ b/kernel/trace/trace_bprintk.c @@ -0,0 +1,87 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +/* binary printk basic */ +static DEFINE_MUTEX(btrace_mutex); +static int btrace_metadata_count; + +static inline void lock_btrace(void) +{ + mutex_lock(&btrace_mutex); +} + +static inline void unlock_btrace(void) +{ + mutex_unlock(&btrace_mutex); +} + +static void get_btrace_metadata(void) +{ + lock_btrace(); + btrace_metadata_count++; + unlock_btrace(); +} + +static void put_btrace_metadata(void) +{ + lock_btrace(); + btrace_metadata_count--; + unlock_btrace(); +} + +/* events tracer */ +int trace_bprintk_enable; + +static void start_bprintk_trace(struct trace_array *tr) +{ + get_btrace_metadata(); + tracing_reset_online_cpus(tr); + trace_bprintk_enable = 1; +} + +static void stop_bprintk_trace(struct trace_array *tr) +{ + trace_bprintk_enable = 0; + tracing_reset_online_cpus(tr); + put_btrace_metadata(); +} + +static int init_bprintk_trace(struct trace_array *tr) +{ + start_bprintk_trace(tr); + return 0; +} + +static struct tracer bprintk_trace __read_mostly = +{ + .name = "events", + .init = init_bprintk_trace, + .reset = stop_bprintk_trace, + .start = start_bprintk_trace, + .stop = stop_bprintk_trace, +}; + +static __init int init_bprintk(void) +{ + return register_tracer(&bprintk_trace); +} + +device_initcall(init_bprintk); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 306fef84c503..4ab71201862e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -53,6 +53,26 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +static int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (!len) + return 0; + + ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} + /** * trace_seq_puts - trace sequence printing of simple string * @s: trace sequence descriptor @@ -855,6 +875,60 @@ static struct trace_event trace_print_event = { .raw = trace_print_raw, }; +/* TRACE_BPRINTK */ +static enum print_line_t +trace_bprintk_print(struct trace_iterator *iter, int flags) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprintk_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t +trace_bprintk_raw(struct trace_iterator *iter, int flags) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprintk_entry *field; + + trace_assign_type(field, entry); + + if (!trace_seq_printf(s, ": %lx : ", field->ip)) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event trace_bprintk_event = { + .type = TRACE_BPRINTK, + .trace = trace_bprintk_print, + .raw = trace_bprintk_raw, + .hex = trace_nop_print, + .binary = trace_nop_print, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, @@ -863,6 +937,7 @@ static struct trace_event *events[] __initdata = { &trace_stack_event, &trace_user_stack_event, &trace_print_event, + &trace_bprintk_event, NULL }; -- cgit v1.2.3-59-g8ed1b From 1ba28e02a18cbdbea123836f6c98efb09cbf59ec Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 6 Mar 2009 17:21:48 +0100 Subject: tracing: add trace_bprintk() Impact: add a generic printk() for tracing, like trace_printk() trace_bprintk() uses the infrastructure to record events on ring_buffer. [ fweisbec@gmail.com: ported to latest -tip, made it work if !CONFIG_MODULES, never free the format strings from modules because we can't keep track of them and conditionnaly create the ftrace format strings section (reported by Steven Rostedt) ] Signed-off-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 9 ++++ include/linux/ftrace.h | 21 ++++++++++ include/linux/module.h | 5 +++ kernel/module.c | 6 +++ kernel/trace/trace.c | 15 +++++++ kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++----- 6 files changed, 133 insertions(+), 10 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0add6b28c366..48ade3168b13 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -69,6 +69,14 @@ #define FTRACE_EVENTS() #endif +#ifdef CONFIG_TRACING +#define TRACE_PRINTKS() VMLINUX_SYMBOL(__start___trace_bprintk_fmt) = .; \ + *(__trace_printk_fmt) /* Trace_printk fmt' pointer */ \ + VMLINUX_SYMBOL(__stop___trace_bprintk_fmt) = .; +#else +#define TRACE_PRINTKS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -100,6 +108,7 @@ *(__vermagic) /* Kernel version magic */ \ *(__markers_strings) /* Markers: strings */ \ *(__tracepoints_strings)/* Tracepoints: strings */ \ + TRACE_PRINTKS() \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1c9cdca02580..1cc8ca453a9b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -225,6 +225,27 @@ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); #ifdef CONFIG_TRACE_BPRINTK extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +extern int __trace_bprintk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + +static inline void ____trace_bprintk_check_format(const char *fmt, ...) + __attribute__ ((format (printf, 1, 2))); +static inline void ____trace_bprintk_check_format(const char *fmt, ...) {} +#define __trace_bprintk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_bprintk_check_format(fmt, ##args); \ +} while (0) + +#define trace_bprintk(fmt, args...) \ +do { \ + static char *__attribute__((section("__trace_bprintk_fmt"))) \ + trace_bprintk_fmt = fmt; \ + __trace_bprintk_check_format(fmt, ##args); \ + __trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args); \ +} while (0) +#else +#define trace_bprintk trace_printk #endif /* May be defined in arch */ diff --git a/include/linux/module.h b/include/linux/module.h index 145a75528cc1..8cbec972d8e7 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -329,6 +329,11 @@ struct module unsigned int num_tracepoints; #endif +#ifdef CONFIG_TRACE_BPRINTK + const char **trace_bprintk_fmt_start; + unsigned int num_trace_bprintk_fmt; +#endif + #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ struct list_head modules_which_use_me; diff --git a/kernel/module.c b/kernel/module.c index 22d7379709da..2dece104f9a1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2158,6 +2158,12 @@ static noinline struct module *load_module(void __user *umod, &mod->num_tracepoints); #endif +#ifdef CONFIG_TRACE_BPRINTK + mod->trace_bprintk_fmt_start = section_objs(hdr, sechdrs, secstrings, + "__trace_bprintk_fmt", sizeof(char *), + &mod->num_trace_bprintk_fmt); +#endif + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ff53509e19f8..46b3cd7a5752 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3848,6 +3848,21 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); +int __trace_bprintk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!fmt) + return 0; + + va_start(ap, fmt); + ret = trace_vbprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_bprintk); + static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c index 1f8e532c3fb9..f4c245a5cd33 100644 --- a/kernel/trace/trace_bprintk.c +++ b/kernel/trace/trace_bprintk.c @@ -19,9 +19,21 @@ #include "trace.h" +#ifdef CONFIG_MODULES + /* binary printk basic */ static DEFINE_MUTEX(btrace_mutex); -static int btrace_metadata_count; +/* + * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. + */ +static LIST_HEAD(trace_bprintk_fmt_list); + +struct trace_bprintk_fmt { + struct list_head list; + char fmt[0]; +}; + static inline void lock_btrace(void) { @@ -33,26 +45,75 @@ static inline void unlock_btrace(void) mutex_unlock(&btrace_mutex); } -static void get_btrace_metadata(void) + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { - lock_btrace(); - btrace_metadata_count++; - unlock_btrace(); + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; } -static void put_btrace_metadata(void) +static +void hold_module_trace_bprintk_format(const char **start, const char **end) { + const char **iter; lock_btrace(); - btrace_metadata_count--; + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) + + strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(tb_fmt->fmt, *iter); + *iter = tb_fmt->fmt; + } else + *iter = NULL; + } unlock_btrace(); } +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + /* events tracer */ int trace_bprintk_enable; static void start_bprintk_trace(struct trace_array *tr) { - get_btrace_metadata(); tracing_reset_online_cpus(tr); trace_bprintk_enable = 1; } @@ -61,7 +122,6 @@ static void stop_bprintk_trace(struct trace_array *tr) { trace_bprintk_enable = 0; tracing_reset_online_cpus(tr); - put_btrace_metadata(); } static int init_bprintk_trace(struct trace_array *tr) @@ -81,7 +141,14 @@ static struct tracer bprintk_trace __read_mostly = static __init int init_bprintk(void) { - return register_tracer(&bprintk_trace); + int ret = register_module_notifier(&module_trace_bprintk_format_nb); + if (ret) + return ret; + + ret = register_tracer(&bprintk_trace); + if (ret) + unregister_module_notifier(&module_trace_bprintk_format_nb); + return ret; } device_initcall(init_bprintk); -- cgit v1.2.3-59-g8ed1b From 769b0441f438c4bb4872cb8560eb6fe51bcc09ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 6 Mar 2009 17:21:49 +0100 Subject: tracing/core: drop the old trace_printk() implementation in favour of trace_bprintk() Impact: faster and lighter tracing Now that we have trace_bprintk() which is faster and consume lesser memory than trace_printk() and has the same purpose, we can now drop the old implementation in favour of the binary one from trace_bprintk(), which means we move all the implementation of trace_bprintk() to trace_printk(), so the Api doesn't change except that we must now use trace_seq_bprintk() to print the TRACE_PRINT entries. Some changes result of this: - Previously, trace_bprintk depended of a single tracer and couldn't work without. This tracer has been dropped and the whole implementation of trace_printk() (like the module formats management) is now integrated in the tracing core (comes with CONFIG_TRACING), though we keep the file trace_printk (previously trace_bprintk.c) where we can find the module management. Thus we don't overflow trace.c - changes some parts to use trace_seq_bprintk() to print TRACE_PRINT entries. - change a bit trace_printk/trace_vprintk macros to support non-builtin formats constants, and fix 'const' qualifiers warnings. But this is all transparent for developers. - etc... V2: - Rebase against last changes - Fix mispell on the changelog V3: - Rebase against last changes (moving trace_printk() to kernel.h) Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 25 ----- include/linux/kernel.h | 34 +++++- include/linux/module.h | 2 +- kernel/trace/Kconfig | 7 +- kernel/trace/Makefile | 2 +- kernel/trace/trace.c | 212 ++++++++++------------------------- kernel/trace/trace.h | 14 +-- kernel/trace/trace_bprintk.c | 154 ------------------------- kernel/trace/trace_functions_graph.c | 6 +- kernel/trace/trace_mmiotrace.c | 9 +- kernel/trace/trace_output.c | 70 ++---------- kernel/trace/trace_output.h | 2 + kernel/trace/trace_printk.c | 138 +++++++++++++++++++++++ 13 files changed, 262 insertions(+), 413 deletions(-) delete mode 100644 kernel/trace/trace_bprintk.c create mode 100644 kernel/trace/trace_printk.c diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1cc8ca453a9b..e1583f2639b0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -223,31 +223,6 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); -#ifdef CONFIG_TRACE_BPRINTK -extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int __trace_bprintk(unsigned long ip, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); - -static inline void ____trace_bprintk_check_format(const char *fmt, ...) - __attribute__ ((format (printf, 1, 2))); -static inline void ____trace_bprintk_check_format(const char *fmt, ...) {} -#define __trace_bprintk_check_format(fmt, args...) \ -do { \ - if (0) \ - ____trace_bprintk_check_format(fmt, ##args); \ -} while (0) - -#define trace_bprintk(fmt, args...) \ -do { \ - static char *__attribute__((section("__trace_bprintk_fmt"))) \ - trace_bprintk_fmt = fmt; \ - __trace_bprintk_check_format(fmt, ##args); \ - __trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args); \ -} while (0) -#else -#define trace_bprintk trace_printk -#endif - /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7aef15c4645e..4e726b9a71ec 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -423,6 +423,16 @@ extern void ftrace_off_permanent(void); extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); +static inline void __attribute__ ((format (printf, 1, 2))) +____trace_printk_check_format(const char *fmt, ...) +{ +} +#define __trace_printk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_printk_check_format(fmt, ##args); \ +} while (0) + /** * trace_printk - printf formatting in the ftrace buffer * @fmt: the printf format for printing @@ -439,13 +449,31 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); * Please refrain from leaving trace_printks scattered around in * your code. */ -# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt) + +#define trace_printk(fmt, args...) \ +do { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))); \ + trace_printk_fmt = fmt; \ + __trace_printk_check_format(fmt, ##args); \ + __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ +} while (0) + extern int __trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); -# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap) + +#define ftrace_vprintk(fmt, vargs) \ +do { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))); \ + trace_printk_fmt = fmt; \ + __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ +} while (0) + extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); + extern void ftrace_dump(void); #else static inline void @@ -467,7 +495,7 @@ ftrace_vprintk(const char *fmt, va_list ap) return 0; } static inline void ftrace_dump(void) { } -#endif +#endif /* CONFIG_TRACING */ /* * Display an IP address in readable format. diff --git a/include/linux/module.h b/include/linux/module.h index 8cbec972d8e7..22d9878e868c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -329,7 +329,7 @@ struct module unsigned int num_tracepoints; #endif -#ifdef CONFIG_TRACE_BPRINTK +#ifdef CONFIG_TRACING const char **trace_bprintk_fmt_start; unsigned int num_trace_bprintk_fmt; #endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ad8d3617d0a6..8e4a2a61cd75 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -52,6 +52,7 @@ config TRACING select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS select NOP_TRACER + select BINARY_PRINTF # # Minimum requirements an architecture has to meet for us to @@ -97,12 +98,6 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. -config TRACE_BPRINTK - bool "Binary printk for tracing" - default y - depends on TRACING - select BINARY_PRINTF - config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46557ef4c379..c7a2943796eb 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -22,7 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o -obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o +obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 46b3cd7a5752..cc94f8642485 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1169,6 +1169,67 @@ void trace_graph_return(struct ftrace_graph_ret *trace) } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +/** + * trace_vprintk - write binary msg to tracing buffer + * + */ +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +{ + static DEFINE_SPINLOCK(trace_buf_lock); + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct print_entry *entry; + unsigned long flags; + int resched; + int cpu, len = 0, size, pc; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + spin_lock_irqsave(&trace_buf_lock, flags); + len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = sizeof(*entry) + sizeof(u32) * len; + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->depth = depth; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + ring_buffer_unlock_commit(tr->buffer, event); + +out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + +out: + ftrace_preempt_enable(resched); + unpause_graph_tracing(); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vprintk); + enum trace_file_type { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, @@ -1564,7 +1625,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%s", field->buf); + ret = trace_seq_bprintf(s, field->fmt, field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -3714,155 +3775,6 @@ static __init int tracer_init_debugfs(void) return 0; } -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) -{ - static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; - static char trace_buf[TRACE_BUF_SIZE]; - - struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - int cpu, len = 0, size, pc; - struct print_entry *entry; - unsigned long irq_flags; - - if (tracing_disabled || tracing_selftest_running) - return 0; - - pc = preempt_count(); - preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (unlikely(atomic_read(&data->disabled))) - goto out; - - pause_graph_tracing(); - raw_local_irq_save(irq_flags); - __raw_spin_lock(&trace_buf_lock); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - len = min(len, TRACE_BUF_SIZE-1); - trace_buf[len] = 0; - - size = sizeof(*entry) + len + 1; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->depth = depth; - - memcpy(&entry->buf, trace_buf, len); - entry->buf[len] = 0; - ring_buffer_unlock_commit(tr->buffer, event); - - out_unlock: - __raw_spin_unlock(&trace_buf_lock); - raw_local_irq_restore(irq_flags); - unpause_graph_tracing(); - out: - preempt_enable_notrace(); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vprintk); - -int __trace_printk(unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_printk); - -int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) -{ - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); -} -EXPORT_SYMBOL_GPL(__ftrace_vprintk); - -/** - * trace_vbprintk - write binary msg to tracing buffer - * - * Caller must insure @fmt are valid when msg is in tracing buffer. - */ -int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) -{ - static DEFINE_SPINLOCK(trace_buf_lock); - static u32 trace_buf[TRACE_BUF_SIZE]; - - struct ring_buffer_event *event; - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - struct bprintk_entry *entry; - unsigned long flags; - int resched; - int cpu, len = 0, size, pc; - - if (tracing_disabled || !trace_bprintk_enable) - return 0; - - pc = preempt_count(); - resched = ftrace_preempt_disable(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (unlikely(atomic_read(&data->disabled))) - goto out; - - spin_lock_irqsave(&trace_buf_lock, flags); - len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); - - if (len > TRACE_BUF_SIZE || len < 0) - goto out_unlock; - - size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc); - if (!event) - goto out_unlock; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, trace_buf, sizeof(u32) * len); - ring_buffer_unlock_commit(tr->buffer, event); - -out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); - -out: - ftrace_preempt_enable(resched); - - return len; -} -EXPORT_SYMBOL_GPL(trace_vbprintk); - -int __trace_bprintk(unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!fmt) - return 0; - - va_start(ap, fmt); - ret = trace_vbprintk(ip, fmt, ap); - va_end(ap); - return ret; -} -EXPORT_SYMBOL_GPL(__trace_bprintk); - static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0f5077f8f957..6140922392c8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,7 +20,6 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, - TRACE_BPRINTK, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -120,16 +119,10 @@ struct userstack_entry { */ struct print_entry { struct trace_entry ent; - unsigned long ip; + unsigned long ip; int depth; - char buf[]; -}; - -struct bprintk_entry { - struct trace_entry ent; - unsigned long ip; - const char *fmt; - u32 buf[]; + const char *fmt; + u32 buf[]; }; #ifdef CONFIG_TRACE_BPRINTK extern int trace_bprintk_enable; @@ -296,7 +289,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ - IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c deleted file mode 100644 index f4c245a5cd33..000000000000 --- a/kernel/trace/trace_bprintk.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * trace binary printk - * - * Copyright (C) 2008 Lai Jiangshan - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "trace.h" - -#ifdef CONFIG_MODULES - -/* binary printk basic */ -static DEFINE_MUTEX(btrace_mutex); -/* - * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt - * which are queued on trace_bprintk_fmt_list. - */ -static LIST_HEAD(trace_bprintk_fmt_list); - -struct trace_bprintk_fmt { - struct list_head list; - char fmt[0]; -}; - - -static inline void lock_btrace(void) -{ - mutex_lock(&btrace_mutex); -} - -static inline void unlock_btrace(void) -{ - mutex_unlock(&btrace_mutex); -} - - -static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) -{ - struct trace_bprintk_fmt *pos; - list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { - if (!strcmp(pos->fmt, fmt)) - return pos; - } - return NULL; -} - -static -void hold_module_trace_bprintk_format(const char **start, const char **end) -{ - const char **iter; - lock_btrace(); - for (iter = start; iter < end; iter++) { - struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); - if (tb_fmt) { - *iter = tb_fmt->fmt; - continue; - } - - tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) - + strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt) { - list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - strcpy(tb_fmt->fmt, *iter); - *iter = tb_fmt->fmt; - } else - *iter = NULL; - } - unlock_btrace(); -} - -static int module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - if (mod->num_trace_bprintk_fmt) { - const char **start = mod->trace_bprintk_fmt_start; - const char **end = start + mod->num_trace_bprintk_fmt; - - if (val == MODULE_STATE_COMING) - hold_module_trace_bprintk_format(start, end); - } - return 0; -} - -#else /* !CONFIG_MODULES */ -__init static int -module_trace_bprintk_format_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} -#endif /* CONFIG_MODULES */ - - -__initdata_or_module static -struct notifier_block module_trace_bprintk_format_nb = { - .notifier_call = module_trace_bprintk_format_notify, -}; - -/* events tracer */ -int trace_bprintk_enable; - -static void start_bprintk_trace(struct trace_array *tr) -{ - tracing_reset_online_cpus(tr); - trace_bprintk_enable = 1; -} - -static void stop_bprintk_trace(struct trace_array *tr) -{ - trace_bprintk_enable = 0; - tracing_reset_online_cpus(tr); -} - -static int init_bprintk_trace(struct trace_array *tr) -{ - start_bprintk_trace(tr); - return 0; -} - -static struct tracer bprintk_trace __read_mostly = -{ - .name = "events", - .init = init_bprintk_trace, - .reset = stop_bprintk_trace, - .start = start_bprintk_trace, - .stop = stop_bprintk_trace, -}; - -static __init int init_bprintk(void) -{ - int ret = register_module_notifier(&module_trace_bprintk_format_nb); - if (ret) - return ret; - - ret = register_tracer(&bprintk_trace); - if (ret) - unregister_module_notifier(&module_trace_bprintk_format_nb); - return ret; -} - -device_initcall(init_bprintk); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e527f2f66c73..453ebd3b636e 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -742,7 +742,11 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s, } /* The comment */ - ret = trace_seq_printf(s, "/* %s", trace->buf); + ret = trace_seq_printf(s, "/* "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_bprintf(s, trace->fmt, trace->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index c401b908e805..23e346a734ca 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -254,15 +254,18 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; - const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); + ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_bprintf(s, print->fmt, print->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 4ab71201862e..ef8fd661b217 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -53,8 +53,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } -static int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { int len = (PAGE_SIZE - 1) - s->len; int ret; @@ -834,54 +833,12 @@ static struct trace_event trace_user_stack_event = { }; /* TRACE_PRINT */ -static enum print_line_t trace_print_print(struct trace_iterator *iter, - int flags) -{ - struct print_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_printf(s, ": %s", field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) -{ - struct print_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; -} - -static struct trace_event trace_print_event = { - .type = TRACE_PRINT, - .trace = trace_print_print, - .raw = trace_print_raw, -}; - -/* TRACE_BPRINTK */ static enum print_line_t -trace_bprintk_print(struct trace_iterator *iter, int flags) +trace_print_print(struct trace_iterator *iter, int flags) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct bprintk_entry *field; + struct print_entry *field; trace_assign_type(field, entry); @@ -900,14 +857,13 @@ trace_bprintk_print(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t -trace_bprintk_raw(struct trace_iterator *iter, int flags) + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) { - struct trace_entry *entry = iter->ent; + struct print_entry *field; struct trace_seq *s = &iter->seq; - struct bprintk_entry *field; - trace_assign_type(field, entry); + trace_assign_type(field, iter->ent); if (!trace_seq_printf(s, ": %lx : ", field->ip)) goto partial; @@ -921,12 +877,11 @@ trace_bprintk_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_bprintk_event = { - .type = TRACE_BPRINTK, - .trace = trace_bprintk_print, - .raw = trace_bprintk_raw, - .hex = trace_nop_print, - .binary = trace_nop_print, + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .trace = trace_print_print, + .raw = trace_print_raw, }; static struct trace_event *events[] __initdata = { @@ -937,7 +892,6 @@ static struct trace_event *events[] __initdata = { &trace_stack_event, &trace_user_stack_event, &trace_print_event, - &trace_bprintk_event, NULL }; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 8a34d688ed63..3b90e6ade1aa 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -18,6 +18,8 @@ struct trace_event { extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c new file mode 100644 index 000000000000..a50aea22e929 --- /dev/null +++ b/kernel/trace/trace_printk.c @@ -0,0 +1,138 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +#ifdef CONFIG_MODULES + +/* + * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. + */ +static LIST_HEAD(trace_bprintk_fmt_list); + +/* serialize accesses to trace_bprintk_fmt_list */ +static DEFINE_MUTEX(btrace_mutex); + +struct trace_bprintk_fmt { + struct list_head list; + char fmt[0]; +}; + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) +{ + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; +} + +static +void hold_module_trace_bprintk_format(const char **start, const char **end) +{ + const char **iter; + + mutex_lock(&btrace_mutex); + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) + + strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(tb_fmt->fmt, *iter); + *iter = tb_fmt->fmt; + } else + *iter = NULL; + } + mutex_unlock(&btrace_mutex); +} + +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + +int __trace_printk(unsigned long ip, const char *fmt, ...) + { + int ret; + va_list ap; + + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) + { + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + + +static __init int init_trace_printk(void) +{ + return register_module_notifier(&module_trace_bprintk_format_nb); +} + +early_initcall(init_trace_printk); -- cgit v1.2.3-59-g8ed1b From 9de36825b321fe9fe9cf73260554251af579f4ca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 6 Mar 2009 17:52:03 +0100 Subject: tracing: trace_bprintk() cleanups Impact: cleanup Remove a few leftovers and clean up the code a bit. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/module.c | 6 ------ kernel/trace/trace.h | 19 ++++++++----------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index 2dece104f9a1..22d7379709da 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2158,12 +2158,6 @@ static noinline struct module *load_module(void __user *umod, &mod->num_tracepoints); #endif -#ifdef CONFIG_TRACE_BPRINTK - mod->trace_bprintk_fmt_start = section_objs(hdr, sechdrs, secstrings, - "__trace_bprintk_fmt", sizeof(char *), - &mod->num_trace_bprintk_fmt); -#endif - #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6140922392c8..2bfb7d11fc17 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -119,14 +119,11 @@ struct userstack_entry { */ struct print_entry { struct trace_entry ent; - unsigned long ip; + unsigned long ip; int depth; const char *fmt; - u32 buf[]; + u32 buf[]; }; -#ifdef CONFIG_TRACE_BPRINTK -extern int trace_bprintk_enable; -#endif #define TRACE_OLD_SIZE 88 @@ -199,7 +196,7 @@ struct kmemtrace_free_entry { * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: * IRQS_OFF - interrupts were disabled - * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags + * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags * NEED_RESCED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler @@ -302,7 +299,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ - IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ + IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ @@ -325,8 +322,8 @@ enum print_line_t { * flags value in struct tracer_flags. */ struct tracer_opt { - const char *name; /* Will appear on the trace_options file */ - u32 bit; /* Mask assigned in val field in tracer_flags */ + const char *name; /* Will appear on the trace_options file */ + u32 bit; /* Mask assigned in val field in tracer_flags */ }; /* @@ -335,7 +332,7 @@ struct tracer_opt { */ struct tracer_flags { u32 val; - struct tracer_opt *opts; + struct tracer_opt *opts; }; /* Makes more easy to define a tracer opt */ @@ -390,7 +387,7 @@ struct tracer { int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; int print_max; - struct tracer_flags *flags; + struct tracer_flags *flags; struct tracer_stat *stats; }; -- cgit v1.2.3-59-g8ed1b From 24418492aa245e9812c425593883b9db52fd8d29 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Wed, 4 Mar 2009 19:33:48 +0100 Subject: xfs: move declaration to header file Fix this sparse warning: fs/xfs/xfs_da_btree.c:1550:26: warning: symbol 'xfs_default_nameops' was not declared. Should it be static? Signed-off-by: Hannes Eder Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/xfs_da_btree.h | 1 + fs/xfs/xfs_dir2.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 9d5a7e8c38fe..41f27b332048 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -268,5 +268,6 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); extern struct kmem_zone *xfs_da_state_zone; extern struct kmem_zone *xfs_dabuf_zone; +extern const struct xfs_nameops xfs_default_nameops; #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 1afb12278b8d..c657bec6d951 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -46,8 +46,6 @@ struct xfs_name xfs_name_dotdot = {"..", 2}; -extern const struct xfs_nameops xfs_default_nameops; - /* * ASCII case-insensitive (ie. A-Z) support for directories that was * used in IRIX. -- cgit v1.2.3-59-g8ed1b From 3180e66d77e3c34cb466188105eace05dfeb5681 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Wed, 4 Mar 2009 19:34:10 +0100 Subject: xfs: make symbols static Instead of the keyword 'static' the macro 'STATIC' is used, so the symbols are still global with CONFIG_XFS_DEBUG. Fix this sparse warnings: fs/xfs/linux-2.6/xfs_super.c:638:1: warning: symbol 'xfs_blkdev_get' was not declared. Should it be static? fs/xfs/linux-2.6/xfs_super.c:655:1: warning: symbol 'xfs_blkdev_put' was not declared. Should it be static? fs/xfs/linux-2.6/xfs_super.c:876:1: warning: symbol 'xfsaild' was not declared. Should it be static? fs/xfs/xfs_bmap.c:6208:1: warning: symbol 'xfs_check_block' was not declared. Should it be static? fs/xfs/xfs_dir2_leaf.c:553:1: warning: symbol 'xfs_dir2_leaf_check' was not declared. Should it be static? Signed-off-by: Hannes Eder Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_super.c | 6 +++--- fs/xfs/xfs_bmap.c | 2 +- fs/xfs/xfs_dir2_leaf.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 8483b35821e0..9f974f4b3307 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -633,7 +633,7 @@ xfs_max_file_offset( return (((__uint64_t)pagefactor) << bitshift) - 1; } -int +STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, @@ -650,7 +650,7 @@ xfs_blkdev_get( return -error; } -void +STATIC void xfs_blkdev_put( struct block_device *bdev) { @@ -871,7 +871,7 @@ xfsaild_wakeup( wake_up_process(ailp->xa_task); } -int +STATIC int xfsaild( void *data) { diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c852cd65aaea..27062d0a879d 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -6204,7 +6204,7 @@ xfs_bmap_get_bp( return(bp); } -void +STATIC void xfs_check_block( struct xfs_btree_block *block, xfs_mount_t *mp, diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index ef805a374eec..6427ff11089f 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -549,7 +549,7 @@ xfs_dir2_leaf_addname( * Check the internal consistency of a leaf1 block. * Pop an assert if something is wrong. */ -void +STATIC void xfs_dir2_leaf_check( xfs_inode_t *dp, /* incore directory inode */ xfs_dabuf_t *bp) /* leaf's buffer */ -- cgit v1.2.3-59-g8ed1b From 7bf446f8b581cef434f5ff05e8a791563bc09b7f Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Thu, 5 Mar 2009 15:20:25 +0100 Subject: xfs: include header files for prototypes Fix this sparse warnings: fs/xfs/linux-2.6/xfs_ioctl.c:72:1: warning: symbol 'xfs_find_handle' was not declared. Should it be static? fs/xfs/linux-2.6/xfs_ioctl.c:249:1: warning: symbol 'xfs_open_by_handle' was not declared. Should it be static? fs/xfs/linux-2.6/xfs_ioctl.c:361:1: warning: symbol 'xfs_readlink_by_handle' was not declared. Should it be static? fs/xfs/linux-2.6/xfs_ioctl.c:496:1: warning: symbol 'xfs_attrmulti_attr_get' was not declared. Should it be static? fs/xfs/linux-2.6/xfs_ioctl.c:525:1: warning: symbol 'xfs_attrmulti_attr_set' was not declared. Should it be static? fs/xfs/linux-2.6/xfs_ioctl.c:555:1: warning: symbol 'xfs_attrmulti_attr_remove' was not declared. Should it be static? fs/xfs/linux-2.6/xfs_ioctl.c:657:1: warning: symbol 'xfs_ioc_space' was not declared. Should it be static? fs/xfs/linux-2.6/xfs_ioctl.c:1340:1: warning: symbol 'xfs_file_ioctl' was not declared. Should it be static? fs/xfs/support/debug.c:65:1: warning: symbol 'xfs_fs_vcmn_err' was not declared. Should it be static? fs/xfs/support/debug.c:112:1: warning: symbol 'xfs_hex_dump' was not declared. Should it be static? Signed-off-by: Hannes Eder Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_ioctl.c | 1 + fs/xfs/support/debug.c | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 6f04493b8aba..d0b499418a7d 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -34,6 +34,7 @@ #include "xfs_dir2_sf.h" #include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_ioctl.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_rtalloc.h" diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index ae5482965424..930bb34804b0 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -17,6 +17,7 @@ */ #include #include "debug.h" +#include "xfs_error.h" /* xfs_mount.h drags a lot of crap in, sorry.. */ #include "xfs_sb.h" -- cgit v1.2.3-59-g8ed1b From 3edaae7c5bda085b7dc704fe379f35b85e6f493e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 3 Mar 2009 19:22:53 +0200 Subject: UBIFS: improve find function interface Make 'ubifs_find_free_space()' return offset where free space starts, rather than the amount of free space. This is just more appropriat for its caller. Signed-off-by: Artem Bityutskiy --- fs/ubifs/find.c | 12 ++++++------ fs/ubifs/journal.c | 5 ++--- fs/ubifs/ubifs.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 717d79c97c5e..1d54383d1269 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, * ubifs_find_free_space - find a data LEB with free space. * @c: the UBIFS file-system description object * @min_space: minimum amount of required free space - * @free: contains amount of free space in the LEB on exit + * @offs: contains offset of where free space starts on exit * @squeeze: whether to try to find space in a non-empty LEB first * * This function looks for an LEB with at least @min_space bytes of free space. @@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, * failed to find a LEB with @min_space bytes of free space and other a negative * error codes in case of failure. */ -int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, int squeeze) { const struct ubifs_lprops *lprops; @@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, spin_unlock(&c->space_lock); } - *free = lprops->free; + *offs = c->leb_size - lprops->free; ubifs_release_lprops(c); - if (*free == c->leb_size) { + if (*offs == 0) { /* * Ensure that empty LEBs have been unmapped. They may not have * been, for example, because of an unclean unmount. Also @@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, return err; } - dbg_find("found LEB %d, free %d", lnum, *free); - ubifs_assert(*free >= min_space); + dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs); + ubifs_assert(*offs <= c->leb_size - min_space); return lnum; out: diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index a11ca0958a23..a2d334eccbca 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) */ static int reserve_space(struct ubifs_info *c, int jhead, int len) { - int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; + int err = 0, err1, retries = 0, avail, lnum, offs, squeeze; struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; /* @@ -139,10 +139,9 @@ again: * Write buffer wasn't seek'ed or there is no enough space - look for an * LEB with some empty space. */ - lnum = ubifs_find_free_space(c, len, &free, squeeze); + lnum = ubifs_find_free_space(c, len, &offs, squeeze); if (lnum >= 0) { /* Found an LEB, add it to the journal head */ - offs = c->leb_size - free; err = ubifs_add_bud_to_log(c, jhead, lnum, offs); if (err) goto out_return; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 039a68bee29a..2da1193a381f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1500,7 +1500,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free); long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); /* find.c */ -int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, int squeeze); int ubifs_find_free_leb_for_idx(struct ubifs_info *c); int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, -- cgit v1.2.3-59-g8ed1b From cb4f952db3a01a2d56eb17e0eb00ce99ae5f0f50 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sat, 7 Mar 2009 20:53:41 +0200 Subject: UBIFS: amend key_hash return value ... which should be uint32_t, not int. Signed-off-by: Artem Bityutskiy --- fs/ubifs/key.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index efb3430a2581..5fa27ea031ba 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) * @c: UBIFS file-system description object * @key: the key to get hash from */ -static inline int key_hash(const struct ubifs_info *c, - const union ubifs_key *key) +static inline uint32_t key_hash(const struct ubifs_info *c, + const union ubifs_key *key) { return key->u32[1] & UBIFS_S_KEY_HASH_MASK; } @@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c, * @c: UBIFS file-system description object * @k: the key to get hash from */ -static inline int key_hash_flash(const struct ubifs_info *c, const void *k) +static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k) { const union ubifs_key *key = k; -- cgit v1.2.3-59-g8ed1b From 42b40b3d55f5782b00b74d9105c3565fbfa5cb80 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sat, 7 Mar 2009 23:55:09 +0900 Subject: ftrace: fix documentation typo s/trace_max_latency/tracing_max_latency/ There isn't a trace_max_latency file, there is tracing_max_latency. Fix it. Signed-off-by: KOSAKI Motohiro Acked-by: Steven Rostedt LKML-Reference: <20090307235409.5A87.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/ftrace.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt index 22614bef6359..fd9a3e693813 100644 --- a/Documentation/ftrace.txt +++ b/Documentation/ftrace.txt @@ -100,7 +100,7 @@ of ftrace. Here is a list of some of the key files: that is displayed in one of the above output files. - trace_max_latency: + tracing_max_latency: Some of the tracers record the max latency. For example, the time interrupts are disabled. -- cgit v1.2.3-59-g8ed1b From 888b55dc314d26239d84c3b187dae555a81c1605 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Sun, 8 Mar 2009 13:12:43 +0900 Subject: ftrace: tracing header should put '#' at the beginning of a line In a recent discussion, Andrew Morton pointed out that tracing header should put '#' at the beginning of a line. Then, we can easily filtered the header by following grep usage: cat trace | grep -v '^#' Wakeup trace also has the same header problem. Comparison of headers displayed: before this patch: # tracer: wakeup # wakeup latency trace v1.1.5 on 2.6.29-rc7-tip-tip -------------------------------------------------------------------- latency: 19059 us, #21277/21277, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) ----------------- | task: kondemand/1-1644 (uid:0 nice:-5 policy:0 rt_prio:0) ----------------- # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / # ||||| delay # cmd pid ||||| time | caller # \ / ||||| \ | / irqbalan-1887 1d.s. 0us : 1887:120:R + [001] 1644:115:S kondemand/1 irqbalan-1887 1d.s. 1us : default_wake_function <-autoremove_wake_function irqbalan-1887 1d.s. 2us : check_preempt_wakeup <-try_to_wake_up after this patch: # tracer: wakeup # # wakeup latency trace v1.1.5 on 2.6.29-rc7-tip-tip # -------------------------------------------------------------------- # latency: 529 us, #530/530, CPU#0 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4) # ----------------- # | task: kondemand/0-1641 (uid:0 nice:-5 policy:0 rt_prio:0) # ----------------- # # _------=> CPU# # / _-----=> irqs-off # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / # ||||| delay # cmd pid ||||| time | caller # \ / ||||| \ | / sshd-2496 0d.s. 0us : 2496:120:R + [000] 1641:115:S kondemand/0 sshd-2496 0d.s. 1us : default_wake_function <-autoremove_wake_function sshd-2496 0d.s. 1us : check_preempt_wakeup <-try_to_wake_up Signed-off-by: KOSAKI Motohiro Cc: Andrew Morton LKML-Reference: <20090308124421.23C3.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cc94f8642485..e5b56199e5e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1466,11 +1466,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) total = entries + ring_buffer_overruns(iter->tr->buffer); - seq_printf(m, "%s latency trace v1.1.5 on %s\n", + seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); - seq_puts(m, "-----------------------------------" + seq_puts(m, "# -----------------------------------" "---------------------------------\n"); - seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" + seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" " (M:%s VP:%d, KP:%d, SP:%d HP:%d", nsecs_to_usecs(data->saved_latency), entries, @@ -1492,24 +1492,24 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) #else seq_puts(m, ")\n"); #endif - seq_puts(m, " -----------------\n"); - seq_printf(m, " | task: %.16s-%d " + seq_puts(m, "# -----------------\n"); + seq_printf(m, "# | task: %.16s-%d " "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", data->comm, data->pid, data->uid, data->nice, data->policy, data->rt_priority); - seq_puts(m, " -----------------\n"); + seq_puts(m, "# -----------------\n"); if (data->critical_start) { - seq_puts(m, " => started at: "); + seq_puts(m, "# => started at: "); seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); trace_print_seq(m, &iter->seq); - seq_puts(m, "\n => ended at: "); + seq_puts(m, "\n# => ended at: "); seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); trace_print_seq(m, &iter->seq); - seq_puts(m, "\n"); + seq_puts(m, "#\n"); } - seq_puts(m, "\n"); + seq_puts(m, "#\n"); } static void test_cpu_buff_start(struct trace_iterator *iter) -- cgit v1.2.3-59-g8ed1b From 8a20d84d09ab5d121f989cd99e4fc5f4b49f98ba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Mar 2009 10:09:06 +0100 Subject: tracing: trace_printk() fix, move format array to data section Impact: fix kernel crash when using trace_printk() trace_printk_fmt section is defined into the readonly section. But we do: trace_printk_fmt = fmt; to fill in that table of format strings - which is not read-only. Under CONFIG_DEBUG_RODATA=y this crashes ... Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 48ade3168b13..d656b4624024 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -98,6 +98,7 @@ VMLINUX_SYMBOL(__stop___tracepoints) = .; \ LIKELY_PROFILE() \ BRANCH_PROFILE() \ + TRACE_PRINTKS() \ FTRACE_EVENTS() #define RO_DATA(align) \ @@ -108,7 +109,6 @@ *(__vermagic) /* Kernel version magic */ \ *(__markers_strings) /* Markers: strings */ \ *(__tracepoints_strings)/* Tracepoints: strings */ \ - TRACE_PRINTKS() \ } \ \ .rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \ -- cgit v1.2.3-59-g8ed1b From 7bffc23e56e92c14b787bf4d95249a32085bfed5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Mar 2009 10:11:36 +0100 Subject: tracing: optimize trace_printk() Impact: micro-optimization trace_printk() does this unconditionally: trace_printk_fmt = fmt; Where trace_printk_fmt is an entry into a global array. This is very SMP-unfriendly. So only write it once per bootup. Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4e726b9a71ec..7742798c9208 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -454,7 +454,10 @@ do { \ do { \ static const char *trace_printk_fmt \ __attribute__((section("__trace_printk_fmt"))); \ - trace_printk_fmt = fmt; \ + \ + if (!trace_printk_fmt) \ + trace_printk_fmt = fmt; \ + \ __trace_printk_check_format(fmt, ##args); \ __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ } while (0) @@ -467,7 +470,10 @@ __trace_printk(unsigned long ip, const char *fmt, ...) do { \ static const char *trace_printk_fmt \ __attribute__((section("__trace_printk_fmt"))); \ - trace_printk_fmt = fmt; \ + \ + if (!trace_printk_fmt) \ + trace_printk_fmt = fmt; \ + \ __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ } while (0) -- cgit v1.2.3-59-g8ed1b From c3ffc7a40b7e94b094efe1c8ab4e24370a782b65 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 9 Mar 2009 18:15:34 +0900 Subject: tracing: Don't use tracing_record_cmdline() in workqueue tracer Impact: improve workqueue tracer output Currently, /sys/kernel/debug/tracing/trace_stat/workqueues can display wrong and strange thread names. Why? Currently, ftrace has tracing_record_cmdline()/trace_find_cmdline() convenience function that implements a task->comm string cache. This can avoid unnecessary memcpy overhead and the workqueue tracer uses it. However, in general, any trace statistics feature shouldn't use tracing_record_cmdline() because trace statistics can display very old process. Then comm cache can return wrong string because recent process overrides the cache. Fortunately, workqueue trace guarantees that displayed processes are live. Thus we can search comm string from PID at display time. % cat workqueues # CPU INSERTED EXECUTED NAME # | | | | 7 431913 431913 kondemand/7 7 0 0 tail 7 21 21 git 7 0 0 ls 7 9 9 cat 7 832632 832632 unix_chkpwd 7 236292 236292 ls Note: tail, git, ls, cat unix_chkpwd are obiously not workqueue thread. % cat workqueues # CPU INSERTED EXECUTED NAME # | | | | 7 510 510 kondemand/7 7 0 0 kmpathd/7 7 15 15 ata/7 7 0 0 aio/7 7 11 11 kblockd/7 7 1063 1063 work_on_cpu/7 7 167 167 events/7 Signed-off-by: KOSAKI Motohiro Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 4664990fe9c5..46c8dc896bd3 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -99,8 +99,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) pr_warning("trace_workqueue: not enough memory\n"); return; } - tracing_record_cmdline(wq_thread); - INIT_LIST_HEAD(&cws->list); cws->cpu = cpu; @@ -195,11 +193,12 @@ static int workqueue_stat_show(struct seq_file *s, void *p) struct cpu_workqueue_stats *cws = p; unsigned long flags; int cpu = cws->cpu; + struct task_struct *tsk = find_task_by_vpid(cws->pid); seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, atomic_read(&cws->inserted), cws->executed, - trace_find_cmdline(cws->pid)); + tsk ? tsk->comm : "<...>"); spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (&cws->list == workqueue_cpu_stat(cpu)->list.next) -- cgit v1.2.3-59-g8ed1b From 156b5f172a64103bcb13b6d26288388b9019caa3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 6 Mar 2009 10:50:53 -0500 Subject: tracing: typecast sizeof and offsetof to unsigned int Impact: fix compiler warnings On x86_64 sizeof and offsetof are treated as long, where as on x86_32 they are int. This patch typecasts them to unsigned int to avoid one arch giving warnings while the other does not. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 15 ++++++++------- kernel/trace/trace_export.c | 10 +++++----- kernel/trace/trace_format.h | 12 ++++++------ 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4488d90e75ef..fa32ca320767 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -448,8 +448,9 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, } #undef FIELD -#define FIELD(type, name) \ - #type, #name, offsetof(typeof(field), name), sizeof(field.name) +#define FIELD(type, name) \ + #type, #name, (unsigned int)offsetof(typeof(field), name), \ + (unsigned int)sizeof(field.name) static int trace_write_header(struct trace_seq *s) { @@ -457,11 +458,11 @@ static int trace_write_header(struct trace_seq *s) /* struct trace_entry */ return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" - "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" - "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" - "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" - "\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n" + "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" + "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" + "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" + "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" + "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" "\n", FIELD(unsigned char, type), FIELD(unsigned char, flags), diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 0fb7be73e31c..7162ab49d05d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -18,11 +18,11 @@ #include "trace_format.h" #undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) \ - ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \ - "offset:%lu;\tsize:0;\n", \ - offsetof(typeof(field), item)); \ - if (!ret) \ +#define TRACE_FIELD_ZERO_CHAR(item) \ + ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \ + "offset:%u;\tsize:0;\n", \ + (unsigned int)offsetof(typeof(field), item)); \ + if (!ret) \ return 0; diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h index 03f9a4c165ca..97e59a9c82ea 100644 --- a/kernel/trace/trace_format.h +++ b/kernel/trace/trace_format.h @@ -22,9 +22,9 @@ #undef TRACE_FIELD #define TRACE_FIELD(type, item, assign) \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%lu;\tsize:%lu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ if (!ret) \ return 0; @@ -32,9 +32,9 @@ #undef TRACE_FIELD_SPECIAL #define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%lu;\tsize:%lu;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ if (!ret) \ return 0; -- cgit v1.2.3-59-g8ed1b From 2939b0469d04ba9ac791aca9a81625d7eb50662b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 15:47:18 -0400 Subject: tracing: replace TP with TP_ Impact: clean up The macros TPPROTO, TPARGS, TPFMT, TPRAWFMT, and TPCMD all look a bit ugly. This patch adds an underscore to their names. Signed-off-by: Steven Rostedt --- Documentation/tracepoints.txt | 8 +-- include/linux/tracepoint.h | 6 +-- include/trace/block.h | 70 ++++++++++++------------ include/trace/irq_event_types.h | 16 +++--- include/trace/lockdep_event_types.h | 24 ++++----- include/trace/power.h | 12 ++--- include/trace/sched_event_types.h | 98 +++++++++++++++++----------------- include/trace/workqueue.h | 16 +++--- kernel/trace/trace_event_types.h | 28 +++++----- kernel/trace/trace_events_stage_2.h | 6 +-- kernel/trace/trace_events_stage_3.h | 8 +-- kernel/trace/trace_export.c | 8 +-- kernel/trace/trace_format.h | 55 ------------------- samples/tracepoints/tp-samples-trace.h | 8 +-- 14 files changed, 154 insertions(+), 209 deletions(-) delete mode 100644 kernel/trace/trace_format.h diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 6f0a044f5b5e..4ff43c6de299 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -45,8 +45,8 @@ In include/trace/subsys.h : #include DECLARE_TRACE(subsys_eventname, - TPPROTO(int firstarg, struct task_struct *p), - TPARGS(firstarg, p)); + TP_PROTO(int firstarg, struct task_struct *p), + TP_ARGS(firstarg, p)); In subsys/file.c (where the tracing statement must be added) : @@ -66,10 +66,10 @@ Where : - subsys is the name of your subsystem. - eventname is the name of the event to trace. -- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the +- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the function called by this tracepoint. -- TPARGS(firstarg, p) are the parameters names, same as found in the +- TP_ARGS(firstarg, p) are the parameters names, same as found in the prototype. Connecting a function (probe) to a tracepoint is done by providing a diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 152b2f03fb86..3bcc3e171443 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -31,8 +31,8 @@ struct tracepoint { * Keep in sync with vmlinux.lds.h. */ -#define TPPROTO(args...) args -#define TPARGS(args...) args +#define TP_PROTO(args...) args +#define TP_ARGS(args...) args #ifdef CONFIG_TRACEPOINTS @@ -65,7 +65,7 @@ struct tracepoint { { \ if (unlikely(__tracepoint_##name.state)) \ __DO_TRACE(&__tracepoint_##name, \ - TPPROTO(proto), TPARGS(args)); \ + TP_PROTO(proto), TP_ARGS(args)); \ } \ static inline int register_trace_##name(void (*probe)(proto)) \ { \ diff --git a/include/trace/block.h b/include/trace/block.h index 25c6a1fd5b77..25b7068b819e 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -5,72 +5,72 @@ #include DECLARE_TRACE(block_rq_abort, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_insert, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_issue, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_requeue, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_rq_complete, - TPPROTO(struct request_queue *q, struct request *rq), - TPARGS(q, rq)); + TP_PROTO(struct request_queue *q, struct request *rq), + TP_ARGS(q, rq)); DECLARE_TRACE(block_bio_bounce, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_complete, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_backmerge, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_frontmerge, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_bio_queue, - TPPROTO(struct request_queue *q, struct bio *bio), - TPARGS(q, bio)); + TP_PROTO(struct request_queue *q, struct bio *bio), + TP_ARGS(q, bio)); DECLARE_TRACE(block_getrq, - TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + TP_ARGS(q, bio, rw)); DECLARE_TRACE(block_sleeprq, - TPPROTO(struct request_queue *q, struct bio *bio, int rw), - TPARGS(q, bio, rw)); + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + TP_ARGS(q, bio, rw)); DECLARE_TRACE(block_plug, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_unplug_timer, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_unplug_io, - TPPROTO(struct request_queue *q), - TPARGS(q)); + TP_PROTO(struct request_queue *q), + TP_ARGS(q)); DECLARE_TRACE(block_split, - TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), - TPARGS(q, bio, pdu)); + TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), + TP_ARGS(q, bio, pdu)); DECLARE_TRACE(block_remap, - TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from, sector_t to), - TPARGS(q, bio, dev, from, to)); + TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, + sector_t from, sector_t to), + TP_ARGS(q, bio, dev, from, to)); #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 65850bc5ea06..0147d9eef5f4 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -9,25 +9,25 @@ #define TRACE_SYSTEM irq TRACE_EVENT_FORMAT(irq_handler_entry, - TPPROTO(int irq, struct irqaction *action), - TPARGS(irq, action), - TPFMT("irq=%d handler=%s", irq, action->name), + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name), TRACE_STRUCT( TRACE_FIELD(int, irq, irq) ), - TPRAWFMT("irq %d") + TP_RAW_FMT("irq %d") ); TRACE_EVENT_FORMAT(irq_handler_exit, - TPPROTO(int irq, struct irqaction *action, int ret), - TPARGS(irq, action, ret), - TPFMT("irq=%d handler=%s return=%s", + TP_PROTO(int irq, struct irqaction *action, int ret), + TP_ARGS(irq, action, ret), + TP_FMT("irq=%d handler=%s return=%s", irq, action->name, ret ? "handled" : "unhandled"), TRACE_STRUCT( TRACE_FIELD(int, irq, irq) TRACE_FIELD(int, ret, ret) ), - TPRAWFMT("irq %d ret %d") + TP_RAW_FMT("irq %d ret %d") ); #undef TRACE_SYSTEM diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index f713d74a82b4..1f00e8b3543e 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -10,32 +10,32 @@ #ifdef CONFIG_LOCKDEP TRACE_FORMAT(lock_acquire, - TPPROTO(struct lockdep_map *lock, unsigned int subclass, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *next_lock, unsigned long ip), - TPARGS(lock, subclass, trylock, read, check, next_lock, ip), - TPFMT("%s%s%s", trylock ? "try " : "", + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", read ? "read " : "", lock->name) ); TRACE_FORMAT(lock_release, - TPPROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TPARGS(lock, nested, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) ); #ifdef CONFIG_LOCK_STAT TRACE_FORMAT(lock_contended, - TPPROTO(struct lockdep_map *lock, unsigned long ip), - TPARGS(lock, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) ); TRACE_FORMAT(lock_acquired, - TPPROTO(struct lockdep_map *lock, unsigned long ip), - TPARGS(lock, ip), - TPFMT("%s", lock->name) + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) ); #endif diff --git a/include/trace/power.h b/include/trace/power.h index 38aca537e497..ef204666e983 100644 --- a/include/trace/power.h +++ b/include/trace/power.h @@ -18,15 +18,15 @@ struct power_trace { }; DECLARE_TRACE(power_start, - TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), - TPARGS(it, type, state)); + TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), + TP_ARGS(it, type, state)); DECLARE_TRACE(power_mark, - TPPROTO(struct power_trace *it, unsigned int type, unsigned int state), - TPARGS(it, type, state)); + TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state), + TP_ARGS(it, type, state)); DECLARE_TRACE(power_end, - TPPROTO(struct power_trace *it), - TPARGS(it)); + TP_PROTO(struct power_trace *it), + TP_ARGS(it)); #endif /* _TRACE_POWER_H */ diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index a6de5c1601a0..71b14828a957 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -9,143 +9,143 @@ #define TRACE_SYSTEM sched TRACE_EVENT_FORMAT(sched_kthread_stop, - TPPROTO(struct task_struct *t), - TPARGS(t), - TPFMT("task %s:%d", t->comm, t->pid), + TP_PROTO(struct task_struct *t), + TP_ARGS(t), + TP_FMT("task %s:%d", t->comm, t->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, t->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_kthread_stop_ret, - TPPROTO(int ret), - TPARGS(ret), - TPFMT("ret=%d", ret), + TP_PROTO(int ret), + TP_ARGS(ret), + TP_FMT("ret=%d", ret), TRACE_STRUCT( TRACE_FIELD(int, ret, ret) ), - TPRAWFMT("ret=%d") + TP_RAW_FMT("ret=%d") ); TRACE_EVENT_FORMAT(sched_wait_task, - TPPROTO(struct rq *rq, struct task_struct *p), - TPARGS(rq, p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct rq *rq, struct task_struct *p), + TP_ARGS(rq, p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_wakeup, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success), - TPFMT("task %s:%d %s", + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), + TP_FMT("task %s:%d %s", p->comm, p->pid, success ? "succeeded" : "failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), - TPRAWFMT("task %d success=%d") + TP_RAW_FMT("task %d success=%d") ); TRACE_EVENT_FORMAT(sched_wakeup_new, - TPPROTO(struct rq *rq, struct task_struct *p, int success), - TPARGS(rq, p, success), - TPFMT("task %s:%d", + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), + TP_FMT("task %s:%d", p->comm, p->pid, success ? "succeeded" : "failed"), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, success, success) ), - TPRAWFMT("task %d success=%d") + TP_RAW_FMT("task %d success=%d") ); TRACE_EVENT_FORMAT(sched_switch, - TPPROTO(struct rq *rq, struct task_struct *prev, + TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), - TPARGS(rq, prev, next), - TPFMT("task %s:%d ==> %s:%d", + TP_ARGS(rq, prev, next), + TP_FMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], next_comm, - TPCMD(memcpy(TRACE_ENTRY->next_comm, + TP_CMD(memcpy(TRACE_ENTRY->next_comm, next->comm, TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), - TPRAWFMT("prev %d:%d ==> next %s:%d:%d") + TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") ); TRACE_EVENT_FORMAT(sched_migrate_task, - TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - TPARGS(p, orig_cpu, dest_cpu), - TPFMT("task %s:%d from: %d to: %d", + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TP_ARGS(p, orig_cpu, dest_cpu), + TP_FMT("task %s:%d from: %d to: %d", p->comm, p->pid, orig_cpu, dest_cpu), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) TRACE_FIELD(int, orig_cpu, orig_cpu) TRACE_FIELD(int, dest_cpu, dest_cpu) ), - TPRAWFMT("task %d from: %d to: %d") + TP_RAW_FMT("task %d from: %d to: %d") ); TRACE_EVENT_FORMAT(sched_process_free, - TPPROTO(struct task_struct *p), - TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct task_struct *p), + TP_ARGS(p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_exit, - TPPROTO(struct task_struct *p), - TPARGS(p), - TPFMT("task %s:%d", p->comm, p->pid), + TP_PROTO(struct task_struct *p), + TP_ARGS(p), + TP_FMT("task %s:%d", p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_wait, - TPPROTO(struct pid *pid), - TPARGS(pid), - TPFMT("pid %d", pid_nr(pid)), + TP_PROTO(struct pid *pid), + TP_ARGS(pid), + TP_FMT("pid %d", pid_nr(pid)), TRACE_STRUCT( TRACE_FIELD(pid_t, pid, pid_nr(pid)) ), - TPRAWFMT("task %d") + TP_RAW_FMT("task %d") ); TRACE_EVENT_FORMAT(sched_process_fork, - TPPROTO(struct task_struct *parent, struct task_struct *child), - TPARGS(parent, child), - TPFMT("parent %s:%d child %s:%d", + TP_PROTO(struct task_struct *parent, struct task_struct *child), + TP_ARGS(parent, child), + TP_FMT("parent %s:%d child %s:%d", parent->comm, parent->pid, child->comm, child->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, parent, parent->pid) TRACE_FIELD(pid_t, child, child->pid) ), - TPRAWFMT("parent %d child %d") + TP_RAW_FMT("parent %d child %d") ); TRACE_EVENT_FORMAT(sched_signal_send, - TPPROTO(int sig, struct task_struct *p), - TPARGS(sig, p), - TPFMT("sig: %d task %s:%d", sig, p->comm, p->pid), + TP_PROTO(int sig, struct task_struct *p), + TP_ARGS(sig, p), + TP_FMT("sig: %d task %s:%d", sig, p->comm, p->pid), TRACE_STRUCT( TRACE_FIELD(int, sig, sig) TRACE_FIELD(pid_t, pid, p->pid) ), - TPRAWFMT("sig: %d task %d") + TP_RAW_FMT("sig: %d task %d") ); #undef TRACE_SYSTEM diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h index 867829df4571..7626523deeba 100644 --- a/include/trace/workqueue.h +++ b/include/trace/workqueue.h @@ -6,20 +6,20 @@ #include DECLARE_TRACE(workqueue_insertion, - TPPROTO(struct task_struct *wq_thread, struct work_struct *work), - TPARGS(wq_thread, work)); + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + TP_ARGS(wq_thread, work)); DECLARE_TRACE(workqueue_execution, - TPPROTO(struct task_struct *wq_thread, struct work_struct *work), - TPARGS(wq_thread, work)); + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + TP_ARGS(wq_thread, work)); /* Trace the creation of one workqueue thread on a cpu */ DECLARE_TRACE(workqueue_creation, - TPPROTO(struct task_struct *wq_thread, int cpu), - TPARGS(wq_thread, cpu)); + TP_PROTO(struct task_struct *wq_thread, int cpu), + TP_ARGS(wq_thread, cpu)); DECLARE_TRACE(workqueue_destruction, - TPPROTO(struct task_struct *wq_thread), - TPARGS(wq_thread)); + TP_PROTO(struct task_struct *wq_thread), + TP_ARGS(wq_thread)); #endif /* __TRACE_WORKQUEUE_H */ diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fb4eba166433..d94179aa1fc2 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -10,7 +10,7 @@ TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned long, parent_ip, parent_ip) ), - TPRAWFMT(" %lx <-- %lx") + TP_RAW_FMT(" %lx <-- %lx") ); TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, @@ -19,7 +19,7 @@ TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, TRACE_FIELD(unsigned long, graph_ent.func, func) TRACE_FIELD(int, graph_ent.depth, depth) ), - TPRAWFMT("--> %lx (%d)") + TP_RAW_FMT("--> %lx (%d)") ); TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, @@ -28,7 +28,7 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, TRACE_FIELD(unsigned long, ret.func, func) TRACE_FIELD(int, ret.depth, depth) ), - TPRAWFMT("<-- %lx (%d)") + TP_RAW_FMT("<-- %lx (%d)") ); TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, @@ -41,7 +41,7 @@ TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, TRACE_FIELD(unsigned char, next_state, next_state) TRACE_FIELD(unsigned int, next_cpu, next_cpu) ), - TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") + TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, @@ -54,7 +54,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TRACE_FIELD(unsigned char, next_state, next_state) TRACE_FIELD(unsigned int, next_cpu, next_cpu) ), - TPRAWFMT("%u:%u:%u ==+ %u:%u:%u [%03u]") + TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, @@ -63,7 +63,7 @@ TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, TRACE_FIELD(unsigned long, arg2, arg2) TRACE_FIELD(unsigned long, arg3, arg3) ), - TPRAWFMT("(%08lx) (%08lx) (%08lx)") + TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") ); /* @@ -83,7 +83,7 @@ TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, TRACE_FIELD(unsigned long, caller[6], stack6) TRACE_FIELD(unsigned long, caller[7], stack7) ), - TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); @@ -98,7 +98,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, TRACE_FIELD(unsigned long, caller[6], stack6) TRACE_FIELD(unsigned long, caller[7], stack7) ), - TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" + TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); @@ -108,7 +108,7 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_FIELD(unsigned int, depth, depth) TRACE_FIELD_ZERO_CHAR(buf) ), - TPRAWFMT("%08lx (%d) %s") + TP_RAW_FMT("%08lx (%d) %s") ); TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, @@ -118,7 +118,7 @@ TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) TRACE_FIELD(char, correct, correct) ), - TPRAWFMT("%u:%s:%s (%u)") + TP_RAW_FMT("%u:%s:%s (%u)") ); TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, @@ -126,7 +126,7 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_FIELD(u64, from, from) TRACE_FIELD(u64, to, to) ), - TPRAWFMT("from: %llx to: %llx") + TP_RAW_FMT("from: %llx to: %llx") ); TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, @@ -136,7 +136,7 @@ TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), - TPRAWFMT("%llx->%llx type:%u state:%u") + TP_RAW_FMT("%llx->%llx type:%u state:%u") ); TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, @@ -149,7 +149,7 @@ TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) TRACE_FIELD(int, node, node) ), - TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" + TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" " flags:%x node:%d") ); @@ -159,7 +159,7 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, TRACE_FIELD(unsigned long, call_site, call_site) TRACE_FIELD(const void *, ptr, ptr) ), - TPRAWFMT("type:%u call_site:%lx ptr:%p") + TP_RAW_FMT("type:%u call_site:%lx ptr:%p") ); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index d24a97e74aea..8e2e0f56c2a8 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -20,7 +20,7 @@ * * field = (typeof(field))entry; * - * ret = trace_seq_printf(s, "%s", "\n"); + * ret = trace_seq_printf(s, "%s", "\n"); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -44,8 +44,8 @@ field->item, -#undef TPRAWFMT -#define TPRAWFMT(args...) args +#undef TP_RAW_FMT +#define TP_RAW_FMT(args...) args #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 2c8d76c7dbed..557ca52bcdcd 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -106,8 +106,8 @@ * */ -#undef TPFMT -#define TPFMT(fmt, args...) fmt "\n", ##args +#undef TP_FMT +#define TP_FMT(fmt, args...) fmt "\n", ##args #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ @@ -152,8 +152,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; -#undef TPCMD -#define TPCMD(cmd...) cmd +#undef TP_CMD +#define TP_CMD(cmd...) cmd #undef TRACE_ENTRY #define TRACE_ENTRY entry diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7162ab49d05d..e62bc10f8103 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -26,8 +26,8 @@ return 0; -#undef TPRAWFMT -#define TPRAWFMT(args...) args +#undef TP_RAW_FMT +#define TP_RAW_FMT(args...) args #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ @@ -57,8 +57,8 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; -#undef TPCMD -#define TPCMD(cmd...) cmd +#undef TP_CMD +#define TP_CMD(cmd...) cmd #undef TRACE_ENTRY #define TRACE_ENTRY entry diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h deleted file mode 100644 index 97e59a9c82ea..000000000000 --- a/kernel/trace/trace_format.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " size:%d; offset:%d;\n", - * sizeof(field.type), - * offsetof(struct ftrace_raw_##call, - * item)); - * - * } - */ - -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ - \ - return ret; \ -} - diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h index 01724e04c556..dffdc49878af 100644 --- a/samples/tracepoints/tp-samples-trace.h +++ b/samples/tracepoints/tp-samples-trace.h @@ -5,9 +5,9 @@ #include DECLARE_TRACE(subsys_event, - TPPROTO(struct inode *inode, struct file *file), - TPARGS(inode, file)); + TP_PROTO(struct inode *inode, struct file *file), + TP_ARGS(inode, file)); DECLARE_TRACE(subsys_eventb, - TPPROTO(void), - TPARGS()); + TP_PROTO(void), + TP_ARGS()); #endif -- cgit v1.2.3-59-g8ed1b From 9cc26a261d43e5898287a1f5808132f8f05ceb1c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 16:00:22 -0400 Subject: tracing: use generic __stringify Impact: clean up This removes the custom made STR(x) macros in the tracer and uses the generic __stringify macro instead. Signed-off-by: Steven Rostedt --- kernel/trace/events.c | 4 +--- kernel/trace/trace_events_stage_3.h | 4 ++-- kernel/trace/trace_selftest.c | 6 ++---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/kernel/trace/events.c b/kernel/trace/events.c index f2509cbaacea..9fc918da404f 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -2,9 +2,7 @@ * This is the place to register all trace points as events. */ -/* someday this needs to go in a generic header */ -#define __STR(x) #x -#define STR(x) __STR(x) +#include #include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 557ca52bcdcd..41b82b93c9c7 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -139,7 +139,7 @@ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ - .system = STR(TRACE_SYSTEM), \ + .system = __stringify(TRACE_SYSTEM), \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ } @@ -225,7 +225,7 @@ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ - .system = STR(TRACE_SYSTEM), \ + .system = __stringify(TRACE_SYSTEM), \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ .raw_init = ftrace_raw_init_event_##call, \ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 7238646b8723..f907a2b29028 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,5 +1,6 @@ /* Include in trace.c */ +#include #include #include @@ -100,9 +101,6 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) #ifdef CONFIG_DYNAMIC_FTRACE -#define __STR(x) #x -#define STR(x) __STR(x) - /* Test dynamic code modification and ftrace filters */ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, struct trace_array *tr, @@ -130,7 +128,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, * start of the function names. We simply put a '*' to * accommodate them. */ - func_name = "*" STR(DYN_FTRACE_TEST_NAME); + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); /* filter only on our function */ ftrace_set_filter(func_name, strlen(func_name), 1); -- cgit v1.2.3-59-g8ed1b From da4d03020c2af32f73e8bfbab0a66620d85bb9bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 17:14:30 -0400 Subject: tracing: new format for specialized trace points Impact: clean up and enhancement The TRACE_EVENT_FORMAT macro looks quite ugly and is limited in its ability to save data as well as to print the record out. Working with Ingo Molnar, we came up with a new format that is much more pleasing to the eye of C developers. This new macro is more C style than the old macro, and is more obvious to what it does. Here's the example. The only updated macro in this patch is the sched_switch trace point. The old method looked like this: TRACE_EVENT_FORMAT(sched_switch, TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TP_ARGS(rq, prev, next), TP_FMT("task %s:%d ==> %s:%d", prev->comm, prev->pid, next->comm, next->pid), TRACE_STRUCT( TRACE_FIELD(pid_t, prev_pid, prev->pid) TRACE_FIELD(int, prev_prio, prev->prio) TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], next_comm, TP_CMD(memcpy(TRACE_ENTRY->next_comm, next->comm, TASK_COMM_LEN))) TRACE_FIELD(pid_t, next_pid, next->pid) TRACE_FIELD(int, next_prio, next->prio) ), TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") ); The above method is hard to read and requires two format fields. The new method: /* * Tracepoint for task switches, performed by the scheduler: * * (NOTE: the 'rq' argument is not used by generic trace events, * but used by the latency tracer plugin. ) */ TRACE_EVENT(sched_switch, TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next), TP_ARGS(rq, prev, next), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) __field( int, prev_prio ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) __field( int, next_prio ) ), TP_printk("task %s:%d [%d] ==> %s:%d [%d]", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->next_comm, __entry->next_pid, __entry->next_prio), TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; ) ); This macro is called TRACE_EVENT, it is broken up into 5 parts: TP_PROTO: the proto type of the trace point TP_ARGS: the arguments of the trace point TP_STRUCT_entry: the structure layout of the entry in the ring buffer TP_printk: the printk format TP_fast_assign: the method used to write the entry into the ring buffer The structure is the definition of how the event will be saved in the ring buffer. The printk is used by the internal tracing in case of an oops, and the kernel needs to print out the format of the record to the console. This the TP_printk gives a means to show the records in a human readable format. It is also used to print out the data from the trace file. The TP_fast_assign is executed directly. It is basically like a C function, where the __entry is the handle to the record. Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 + include/trace/sched_event_types.h | 48 +++++++---- kernel/trace/trace.h | 5 -- kernel/trace/trace_event_types.h | 3 +- kernel/trace/trace_events.c | 159 +----------------------------------- kernel/trace/trace_events_stage_1.h | 28 ++++--- kernel/trace/trace_events_stage_2.h | 89 ++++++++++++++++---- kernel/trace/trace_events_stage_3.h | 34 +++----- kernel/trace/trace_export.c | 23 +++++- 9 files changed, 159 insertions(+), 233 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 3bcc3e171443..6b4f1bb3701e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -160,4 +160,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) +#define TRACE_EVENT(name, proto, args, struct, print, assign) \ + DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 71b14828a957..aa77fb754038 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -62,25 +62,41 @@ TRACE_EVENT_FORMAT(sched_wakeup_new, TP_RAW_FMT("task %d success=%d") ); -TRACE_EVENT_FORMAT(sched_switch, +/* + * Tracepoint for task switches, performed by the scheduler: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_switch, + TP_PROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), + struct task_struct *next), + TP_ARGS(rq, prev, next), - TP_FMT("task %s:%d ==> %s:%d", - prev->comm, prev->pid, next->comm, next->pid), - TRACE_STRUCT( - TRACE_FIELD(pid_t, prev_pid, prev->pid) - TRACE_FIELD(int, prev_prio, prev->prio) - TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN], - next_comm, - TP_CMD(memcpy(TRACE_ENTRY->next_comm, - next->comm, - TASK_COMM_LEN))) - TRACE_FIELD(pid_t, next_pid, next->pid) - TRACE_FIELD(int, next_prio, next->prio) + + TP_STRUCT__entry( + __array( char, prev_comm, TASK_COMM_LEN ) + __field( pid_t, prev_pid ) + __field( int, prev_prio ) + __array( char, next_comm, TASK_COMM_LEN ) + __field( pid_t, next_pid ) + __field( int, next_prio ) ), - TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d") - ); + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio), + + TP_fast_assign( + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_pid = prev->pid; + __entry->prev_prio = prev->prio; + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ) +); TRACE_EVENT_FORMAT(sched_migrate_task, TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2bfb7d11fc17..c5e1d8865fe4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -751,12 +751,7 @@ struct ftrace_event_call { int (*regfunc)(void); void (*unregfunc)(void); int id; - struct dentry *raw_dir; - int raw_enabled; - int type; int (*raw_init)(void); - int (*raw_reg)(void); - void (*raw_unreg)(void); int (*show_format)(struct trace_seq *s); }; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index d94179aa1fc2..5cca4c978bde 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -106,9 +106,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD(char *, fmt, fmt) TRACE_FIELD_ZERO_CHAR(buf) ), - TP_RAW_FMT("%08lx (%d) %s") + TP_RAW_FMT("%08lx (%d) fmt:%p %s") ); TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fa32ca320767..1880a6438097 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -59,22 +59,12 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, call->enabled = 0; call->unregfunc(); } - if (call->raw_enabled) { - call->raw_enabled = 0; - call->raw_unreg(); - } break; case 1: - if (!call->enabled && - (call->type & TRACE_EVENT_TYPE_PRINTF)) { + if (!call->enabled) { call->enabled = 1; call->regfunc(); } - if (!call->raw_enabled && - (call->type & TRACE_EVENT_TYPE_RAW)) { - call->raw_enabled = 1; - call->raw_reg(); - } break; } } @@ -300,7 +290,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call = filp->private_data; char *buf; - if (call->enabled || call->raw_enabled) + if (call->enabled) buf = "1\n"; else buf = "0\n"; @@ -346,107 +336,6 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } -static ssize_t -event_type_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[16]; - int r = 0; - - if (call->type & TRACE_EVENT_TYPE_PRINTF) - r += sprintf(buf, "printf\n"); - - if (call->type & TRACE_EVENT_TYPE_RAW) - r += sprintf(buf+r, "raw\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -event_type_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[64]; - - /* - * If there's only one type, we can't change it. - * And currently we always have printf type, and we - * may or may not have raw type. - * - * This is a redundant check, the file should be read - * only if this is the case anyway. - */ - - if (!call->raw_init) - return -EPERM; - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - if (!strncmp(buf, "printf", 6) && - (!buf[6] || isspace(buf[6]))) { - - call->type = TRACE_EVENT_TYPE_PRINTF; - - /* - * If raw enabled, the disable it and enable - * printf type. - */ - if (call->raw_enabled) { - call->raw_enabled = 0; - call->raw_unreg(); - - call->enabled = 1; - call->regfunc(); - } - - } else if (!strncmp(buf, "raw", 3) && - (!buf[3] || isspace(buf[3]))) { - - call->type = TRACE_EVENT_TYPE_RAW; - - /* - * If printf enabled, the disable it and enable - * raw type. - */ - if (call->enabled) { - call->enabled = 0; - call->unregfunc(); - - call->raw_enabled = 1; - call->raw_reg(); - } - } else - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static ssize_t -event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct ftrace_event_call *call = filp->private_data; - char buf[16]; - int r = 0; - - r += sprintf(buf, "printf\n"); - - if (call->raw_init) - r += sprintf(buf+r, "raw\n"); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - #undef FIELD #define FIELD(type, name) \ #type, #name, (unsigned int)offsetof(typeof(field), name), \ @@ -470,6 +359,7 @@ static int trace_write_header(struct trace_seq *s) FIELD(int, pid), FIELD(int, tgid)); } + static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -527,13 +417,6 @@ static const struct seq_operations show_set_event_seq_ops = { .stop = t_stop, }; -static const struct file_operations ftrace_avail_fops = { - .open = ftrace_event_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static const struct file_operations ftrace_set_event_fops = { .open = ftrace_event_seq_open, .read = seq_read, @@ -548,17 +431,6 @@ static const struct file_operations ftrace_enable_fops = { .write = event_enable_write, }; -static const struct file_operations ftrace_type_fops = { - .open = tracing_open_generic, - .read = event_type_read, - .write = event_type_write, -}; - -static const struct file_operations ftrace_available_types_fops = { - .open = tracing_open_generic, - .read = event_available_types_read, -}; - static const struct file_operations ftrace_event_format_fops = { .open = tracing_open_generic, .read = event_format_read, @@ -647,9 +519,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) } } - /* default the output to printf */ - call->type = TRACE_EVENT_TYPE_PRINTF; - call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { pr_warning("Could not create debugfs " @@ -665,21 +534,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) "'%s/enable' entry\n", call->name); } - /* Only let type be writable, if we can change it */ - entry = debugfs_create_file("type", - call->raw_init ? 0644 : 0444, - call->dir, call, - &ftrace_type_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/type' entry\n", call->name); - - entry = debugfs_create_file("available_types", 0444, call->dir, call, - &ftrace_available_types_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/available_types' entry\n", call->name); - /* A trace may not want to export its format */ if (!call->show_format) return 0; @@ -704,13 +558,6 @@ static __init int event_trace_init(void) if (!d_tracer) return 0; - entry = debugfs_create_file("available_events", 0444, d_tracer, - (void *)&show_event_seq_ops, - &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_events' entry\n"); - entry = debugfs_create_file("set_event", 0644, d_tracer, (void *)&show_set_event_seq_ops, &ftrace_set_event_fops); diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 3830a731424c..edfcbd3a0d1b 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -18,19 +18,23 @@ #define TRACE_FORMAT(call, proto, args, fmt) #undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) \ - struct ftrace_raw_##name { \ - struct trace_entry ent; \ - tstruct \ - }; \ - static struct ftrace_event_call event_##name +#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) + +#undef __array +#define __array(type, item, len) type item[len]; -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +#undef __field +#define __field(type, item) type item; -#define TRACE_FIELD(type, item, assign) \ - type item; -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - type_item; +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, args, tstruct, print, assign) \ + struct ftrace_raw_##name { \ + struct trace_entry ent; \ + tstruct \ + }; \ + static struct ftrace_event_call event_##name #include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 8e2e0f56c2a8..d91bf4c56661 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -32,23 +32,14 @@ * in binary. */ -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +#undef __entry +#define __entry field -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign) \ - field->item, +#undef TP_printk +#define TP_printk(fmt, args...) fmt "\n", args -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - field->item, - - -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) args - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ @@ -66,14 +57,76 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ \ field = (typeof(field))entry; \ \ - ret = trace_seq_printf(s, tpfmt "%s", tstruct "\n"); \ + ret = trace_seq_printf(s, print); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ return TRACE_TYPE_HANDLED; \ } - + #include -#include "trace_format.h" +/* + * Setup the showing format of trace point. + * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " size:%d; offset:%d;\n", + * sizeof(field.type), + * offsetof(struct ftrace_raw_##call, + * item)); + * + * } + */ + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef __field +#define __field(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __entry +#define __entry "REC" + +#undef TP_printk +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, func) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: " print); \ + \ + return ret; \ +} + #include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 41b82b93c9c7..8e398d864096 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -144,27 +144,15 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .unregfunc = ftrace_unreg_event_##call, \ } -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ - entry->item = assign; - -#undef TP_CMD -#define TP_CMD(cmd...) cmd - -#undef TRACE_ENTRY -#define TRACE_ENTRY entry +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw) \ + TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ - cmd; +#undef __entry +#define __entry entry -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ \ static struct ftrace_event_call event_##call; \ \ @@ -185,7 +173,7 @@ static void ftrace_raw_event_##call(proto) \ return; \ entry = ring_buffer_event_data(event); \ \ - tstruct; \ + assign; \ \ trace_current_buffer_unlock_commit(event, irq_flags, pc); \ } \ @@ -226,10 +214,8 @@ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ .raw_init = ftrace_raw_init_event_##call, \ - .raw_reg = ftrace_raw_reg_event_##call, \ - .raw_unreg = ftrace_raw_unreg_event_##call, \ + .regfunc = ftrace_raw_reg_event_##call, \ + .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e62bc10f8103..23ae78430d58 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,7 +15,28 @@ #include "trace_output.h" -#include "trace_format.h" + +#undef TRACE_STRUCT +#define TRACE_STRUCT(args...) args + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ + ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; #undef TRACE_FIELD_ZERO_CHAR #define TRACE_FIELD_ZERO_CHAR(item) \ -- cgit v1.2.3-59-g8ed1b From 12b5fdb8bbb2d2fc31746d7b672c12fd8897aa08 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 23:03:44 -0400 Subject: tracing: convert the sched trace points to the TRACE_EVENT macros Impact: enhancement This patch converts the rest of the sched trace points to use the new more powerful TRACE_EVENT macro. Signed-off-by: Steven Rostedt --- include/trace/sched_event_types.h | 322 +++++++++++++++++++++++++++++--------- 1 file changed, 246 insertions(+), 76 deletions(-) diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index aa77fb754038..0bbbf410e01f 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -8,59 +8,137 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM sched -TRACE_EVENT_FORMAT(sched_kthread_stop, +/* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +TRACE_EVENT(sched_kthread_stop, + TP_PROTO(struct task_struct *t), + TP_ARGS(t), - TP_FMT("task %s:%d", t->comm, t->pid), - TRACE_STRUCT( - TRACE_FIELD(pid_t, pid, t->pid) + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) ), - TP_RAW_FMT("task %d") - ); -TRACE_EVENT_FORMAT(sched_kthread_stop_ret, + TP_printk("task %s:%d", __entry->comm, __entry->pid), + + TP_fast_assign( + memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __entry->pid = t->pid; + ) +); + +/* + * Tracepoint for the return value of the kthread stopping: + */ +TRACE_EVENT(sched_kthread_stop_ret, + TP_PROTO(int ret), + TP_ARGS(ret), - TP_FMT("ret=%d", ret), - TRACE_STRUCT( - TRACE_FIELD(int, ret, ret) + + TP_STRUCT__entry( + __field( int, ret ) ), - TP_RAW_FMT("ret=%d") - ); -TRACE_EVENT_FORMAT(sched_wait_task, + TP_printk("ret %d", __entry->ret), + + TP_fast_assign( + __entry->ret = ret; + ) +); + +/* + * Tracepoint for waiting on task to unschedule: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wait_task, + TP_PROTO(struct rq *rq, struct task_struct *p), + TP_ARGS(rq, p), - TP_FMT("task %s:%d", p->comm, p->pid), - TRACE_STRUCT( - TRACE_FIELD(pid_t, pid, p->pid) + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) ), - TP_RAW_FMT("task %d") - ); -TRACE_EVENT_FORMAT(sched_wakeup, + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ) +); + +/* + * Tracepoint for waking up a task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wakeup, + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), - TP_FMT("task %s:%d %s", - p->comm, p->pid, success ? "succeeded" : "failed"), - TRACE_STRUCT( - TRACE_FIELD(pid_t, pid, p->pid) - TRACE_FIELD(int, success, success) + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) ), - TP_RAW_FMT("task %d success=%d") - ); -TRACE_EVENT_FORMAT(sched_wakeup_new, + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ) +); + +/* + * Tracepoint for waking up a new task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wakeup_new, + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + TP_ARGS(rq, p, success), - TP_FMT("task %s:%d", - p->comm, p->pid, success ? "succeeded" : "failed"), - TRACE_STRUCT( - TRACE_FIELD(pid_t, pid, p->pid) - TRACE_FIELD(int, success, success) + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) ), - TP_RAW_FMT("task %d success=%d") - ); + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ) +); /* * Tracepoint for task switches, performed by the scheduler: @@ -98,70 +176,162 @@ TRACE_EVENT(sched_switch, ) ); -TRACE_EVENT_FORMAT(sched_migrate_task, +/* + * Tracepoint for a task being migrated: + */ +TRACE_EVENT(sched_migrate_task, + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TP_ARGS(p, orig_cpu, dest_cpu), - TP_FMT("task %s:%d from: %d to: %d", - p->comm, p->pid, orig_cpu, dest_cpu), - TRACE_STRUCT( - TRACE_FIELD(pid_t, pid, p->pid) - TRACE_FIELD(int, orig_cpu, orig_cpu) - TRACE_FIELD(int, dest_cpu, dest_cpu) + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, orig_cpu ) + __field( int, dest_cpu ) ), - TP_RAW_FMT("task %d from: %d to: %d") - ); -TRACE_EVENT_FORMAT(sched_process_free, + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->orig_cpu = orig_cpu; + __entry->dest_cpu = dest_cpu; + ) +); + +/* + * Tracepoint for freeing a task: + */ +TRACE_EVENT(sched_process_free, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), - TP_FMT("task %s:%d", p->comm, p->pid), - TRACE_STRUCT( - TRACE_FIELD(pid_t, pid, p->pid) + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) ), - TP_RAW_FMT("task %d") - ); -TRACE_EVENT_FORMAT(sched_process_exit, + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ) +); + +/* + * Tracepoint for a task exiting: + */ +TRACE_EVENT(sched_process_exit, + TP_PROTO(struct task_struct *p), + TP_ARGS(p), - TP_FMT("task %s:%d", p->comm, p->pid), - TRACE_STRUCT( - TRACE_FIELD(pid_t, pid, p->pid) + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) ), - TP_RAW_FMT("task %d") - ); -TRACE_EVENT_FORMAT(sched_process_wait, + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ) +); + +/* + * Tracepoint for a waiting task: + */ +TRACE_EVENT(sched_process_wait, + TP_PROTO(struct pid *pid), + TP_ARGS(pid), - TP_FMT("pid %d", pid_nr(pid)), - TRACE_STRUCT( - TRACE_FIELD(pid_t, pid, pid_nr(pid)) + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) ), - TP_RAW_FMT("task %d") - ); -TRACE_EVENT_FORMAT(sched_process_fork, + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio), + + TP_fast_assign( + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + __entry->pid = pid_nr(pid); + __entry->prio = current->prio; + ) +); + +/* + * Tracepoint for do_fork: + */ +TRACE_EVENT(sched_process_fork, + TP_PROTO(struct task_struct *parent, struct task_struct *child), + TP_ARGS(parent, child), - TP_FMT("parent %s:%d child %s:%d", - parent->comm, parent->pid, child->comm, child->pid), - TRACE_STRUCT( - TRACE_FIELD(pid_t, parent, parent->pid) - TRACE_FIELD(pid_t, child, child->pid) + + TP_STRUCT__entry( + __array( char, parent_comm, TASK_COMM_LEN ) + __field( pid_t, parent_pid ) + __array( char, child_comm, TASK_COMM_LEN ) + __field( pid_t, child_pid ) ), - TP_RAW_FMT("parent %d child %d") - ); -TRACE_EVENT_FORMAT(sched_signal_send, + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + __entry->child_comm, __entry->child_pid), + + TP_fast_assign( + memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); + __entry->parent_pid = parent->pid; + memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); + __entry->child_pid = child->pid; + ) +); + +/* + * Tracepoint for sending a signal: + */ +TRACE_EVENT(sched_signal_send, + TP_PROTO(int sig, struct task_struct *p), + TP_ARGS(sig, p), - TP_FMT("sig: %d task %s:%d", sig, p->comm, p->pid), - TRACE_STRUCT( - TRACE_FIELD(int, sig, sig) - TRACE_FIELD(pid_t, pid, p->pid) + + TP_STRUCT__entry( + __field( int, sig ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) ), - TP_RAW_FMT("sig: %d task %d") - ); + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->sig = sig; + ) +); #undef TRACE_SYSTEM -- cgit v1.2.3-59-g8ed1b From d6e2ca4c05be6a5ab16030a9f227301bd6acc9f0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Mar 2009 23:23:30 -0400 Subject: tracing: convert irq trace points to new macros Impact: enhancement Converted the two irq trace point macros. The entry macro copies the name of the irq handler, thus it is better to simply use the TRACE_FORMAT macro which uses the trace_printk. The return of the handler does not need to record the name, thus the faster C style handler is more approriate. Signed-off-by: Steven Rostedt --- include/trace/irq_event_types.h | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 0147d9eef5f4..43bcb74dd49f 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -8,26 +8,36 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq -TRACE_EVENT_FORMAT(irq_handler_entry, +/* + * Tracepoint for entry of interrupt handler: + */ +TRACE_FORMAT(irq_handler_entry, TP_PROTO(int irq, struct irqaction *action), TP_ARGS(irq, action), - TP_FMT("irq=%d handler=%s", irq, action->name), - TRACE_STRUCT( - TRACE_FIELD(int, irq, irq) - ), - TP_RAW_FMT("irq %d") + TP_FMT("irq=%d handler=%s", irq, action->name) ); -TRACE_EVENT_FORMAT(irq_handler_exit, +/* + * Tracepoint for return of an interrupt handler: + */ +TRACE_EVENT(irq_handler_exit, + TP_PROTO(int irq, struct irqaction *action, int ret), + TP_ARGS(irq, action, ret), - TP_FMT("irq=%d handler=%s return=%s", - irq, action->name, ret ? "handled" : "unhandled"), - TRACE_STRUCT( - TRACE_FIELD(int, irq, irq) - TRACE_FIELD(int, ret, ret) + + TP_STRUCT__entry( + __field( int, irq ) + __field( int, ret ) ), - TP_RAW_FMT("irq %d ret %d") - ); + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? "handled" : "unhandled"), + + TP_fast_assign( + __entry->irq = irq; + __entry->ret = ret; + ) +); #undef TRACE_SYSTEM -- cgit v1.2.3-59-g8ed1b From 157587d7ac555458da9f682e3250135e468470a6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 00:15:34 -0400 Subject: tracing: remove obsolete TRACE_EVENT_FORMAT macro Impact: clean up The TRACE_EVENT_FORMAT macro is no longer used by trace points and only the DECLARE_TRACE, TRACE_FORMAT or TRACE_EVENT macros should be used by them. Although the TRACE_EVENT_FORMAT macro is still used by the internal tracing utility, it should not be used in core kernel code. Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 3 --- include/trace/lockdep_event_types.h | 2 +- include/trace/sched_event_types.h | 2 +- kernel/trace/trace_events_stage_1.h | 3 --- kernel/trace/trace_events_stage_3.h | 6 +----- 5 files changed, 3 insertions(+), 13 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 6b4f1bb3701e..69b56988813d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,9 +157,6 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt) \ - TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt)) - #define TRACE_EVENT(name, proto, args, struct, print, assign) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index 1f00e8b3543e..adccfcd2ec8f 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -1,5 +1,5 @@ -#ifndef TRACE_EVENT_FORMAT +#ifndef TRACE_FORMAT # error Do not include this file directly. # error Unless you know what you are doing. #endif diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index 0bbbf410e01f..fb37af672c88 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -1,6 +1,6 @@ /* use instead */ -#ifndef TRACE_EVENT_FORMAT +#ifndef TRACE_EVENT # error Do not include this file directly. # error Unless you know what you are doing. #endif diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index edfcbd3a0d1b..15e9bf965a18 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -17,9 +17,6 @@ #undef TRACE_FORMAT #define TRACE_FORMAT(call, proto, args, fmt) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt) - #undef __array #define __array(type, item, len) type item[len]; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 8e398d864096..3ba55d4ab073 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -35,7 +35,7 @@ * } * * - * For those macros defined with TRACE_EVENT_FORMAT: + * For those macros defined with TRACE_EVENT: * * static struct ftrace_event_call event_; * @@ -144,10 +144,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .unregfunc = ftrace_unreg_event_##call, \ } -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw) \ - TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) - #undef __entry #define __entry entry -- cgit v1.2.3-59-g8ed1b From 631595fbf4aeac260e664a8a002897e4db6a50dd Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Mar 2009 13:57:10 +0900 Subject: doc: add trace_buf_size description to kernel-parameters.txt from early boot tracing view, trace_buf_size parameter is important. it should be documented. Signed-off-by: KOSAKI Motohiro LKML-Reference: <20090310135200.A48B.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 454f42b21f16..7643483bdd6a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2344,6 +2344,8 @@ and is between 256 and 4096 characters. It is defined in the file tp720= [HW,PS2] + trace_buf_size=nn[KMG] [ftrace] will set tracing buffer size. + trix= [HW,OSS] MediaTrix AudioTrix Pro Format: ,,,,,,,, -- cgit v1.2.3-59-g8ed1b From bbcd3063597a3824357cd83c501c2a2aa21ef37b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Mar 2009 10:49:53 +0900 Subject: tracing: Don't assume possible cpu list have continuous numbers "for (++cpu ; cpu < num_possible_cpus(); cpu++)" statement assumes possible cpus have continuous number - but that's a wrong assumption. Insted, cpumask_next() should be used. Signed-off-by: KOSAKI Motohiro Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20090310104437.A480.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 46c8dc896bd3..739fdacf873b 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -91,7 +91,7 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) struct cpu_workqueue_stats *cws; unsigned long flags; - WARN_ON(cpu < 0 || cpu >= num_possible_cpus()); + WARN_ON(cpu < 0); /* Workqueues are sometimes created in atomic context */ cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); @@ -175,12 +175,12 @@ static void *workqueue_stat_next(void *prev, int idx) spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - for (++cpu ; cpu < num_possible_cpus(); cpu++) { - ret = workqueue_stat_start_cpu(cpu); - if (ret) - return ret; - } - return NULL; + do { + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + return NULL; + } while (!(ret = workqueue_stat_start_cpu(cpu))); + return ret; } spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -- cgit v1.2.3-59-g8ed1b From 39e874f8afbdb3745e2406ce4ecbde9ac4cbaa78 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 9 Mar 2009 21:15:04 +0100 Subject: vsprintf: fix bug in negative value printing Sitsofe Wheeler found and bisected that while unifying the vsprintf format decoding in: fef20d9: vsprintf: unify the format decoding layer for its 3 users The sign flag has been dropped out in favour of precise types (ie: LONG/ULONG). But the format helper number() still needs this flag to keep track of the signedness unless it will consider all numbers as unsigned. Also add an explicit cast to int (for %d) while parsing with va_arg() to ensure the highest bit is well extended on the 64 bits number that hosts the value in case of negative values. Reported-Bisected-Tested-by: Sitsofe Wheeler Signed-off-by: Frederic Weisbecker Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Linus Torvalds LKML-Reference: <20090309201503.GA5010@nowhere> Signed-off-by: Ingo Molnar --- lib/vsprintf.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 25f01578c856..dc1674377009 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -768,7 +768,6 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, static int format_decode(const char *fmt, struct printf_spec *spec) { const char *start = fmt; - bool sign = false; /* we finished early by reading the field width */ if (spec->type == FORMAT_TYPE_WITDH) { @@ -900,7 +899,7 @@ qualifier: case 'd': case 'i': - sign = true; + spec->flags |= SIGN; case 'u': break; @@ -912,7 +911,7 @@ qualifier: if (spec->qualifier == 'L') spec->type = FORMAT_TYPE_LONG_LONG; else if (spec->qualifier == 'l') { - if (sign) + if (spec->flags & SIGN) spec->type = FORMAT_TYPE_LONG; else spec->type = FORMAT_TYPE_ULONG; @@ -921,12 +920,12 @@ qualifier: } else if (spec->qualifier == 't') { spec->type = FORMAT_TYPE_PTRDIFF; } else if (spec->qualifier == 'h') { - if (sign) + if (spec->flags & SIGN) spec->type = FORMAT_TYPE_SHORT; else spec->type = FORMAT_TYPE_USHORT; } else { - if (sign) + if (spec->flags & SIGN) spec->type = FORMAT_TYPE_INT; else spec->type = FORMAT_TYPE_UINT; @@ -1101,8 +1100,8 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) case FORMAT_TYPE_SHORT: num = (short) va_arg(args, int); break; - case FORMAT_TYPE_UINT: - num = va_arg(args, unsigned int); + case FORMAT_TYPE_INT: + num = (int) va_arg(args, int); break; default: num = va_arg(args, unsigned int); -- cgit v1.2.3-59-g8ed1b From bbb76b552a9cef86777181c8577acc907b122b41 Mon Sep 17 00:00:00 2001 From: Américo Wang Date: Tue, 10 Mar 2009 17:34:47 +0800 Subject: ptrace: remove a useless goto Impact: cleanup Obviously, this goto is useless. Remove it. Signed-off-by: WANG Cong Cc: Andrew Morton Cc: Roland McGrath LKML-Reference: <20090310093447.GC3179@hack> Signed-off-by: Ingo Molnar --- kernel/ptrace.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c9cf48b21f05..137955913e7e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -612,8 +612,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) goto out_put_task_struct; ret = arch_ptrace(child, request, addr, data); - if (ret < 0) - goto out_put_task_struct; out_put_task_struct: put_task_struct(child); -- cgit v1.2.3-59-g8ed1b From ce8eb2bf05042452107e489782105d2e235cbdd0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 10:14:35 -0400 Subject: tracing: fix printk format specifier Impact: clean up The offsetof and sizeof are of type size_t, and instead of typecasting them to unsigned int for printk formatting, one could just use %zu. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1880a6438097..a0b41cc26f26 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -338,8 +338,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, #undef FIELD #define FIELD(type, name) \ - #type, #name, (unsigned int)offsetof(typeof(field), name), \ - (unsigned int)sizeof(field.name) + #type, #name, offsetof(typeof(field), name), sizeof(field.name) static int trace_write_header(struct trace_seq *s) { @@ -347,11 +346,11 @@ static int trace_write_header(struct trace_seq *s) /* struct trace_entry */ return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" - "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" - "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" - "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" - "\tfield:%s %s;\toffset:%u;\tsize:%u;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" + "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", FIELD(unsigned char, type), FIELD(unsigned char, flags), -- cgit v1.2.3-59-g8ed1b From 7cf49427042400d40bdc80b5c3399b6b5945afa8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 9 Mar 2009 12:40:40 -0400 Subject: x86: expand irq-off region in text_poke() Expand irq-off region to cover fixmap using code and cache synchronizing. Signed-off-by: Masami Hiramatsu LKML-Reference: <49B54688.8090403@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2d903b760ddb..f57658702571 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -526,13 +526,12 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) pages[1] = virt_to_page(addr + PAGE_SIZE); } BUG_ON(!pages[0]); + local_irq_save(flags); set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); if (pages[1]) set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); - local_irq_save(flags); memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - local_irq_restore(flags); clear_fixmap(FIX_TEXT_POKE0); if (pages[1]) clear_fixmap(FIX_TEXT_POKE1); @@ -542,5 +541,6 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) that causes hangs on some VIA CPUs. */ for (i = 0; i < len; i++) BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); + local_irq_restore(flags); return addr; } -- cgit v1.2.3-59-g8ed1b From 40e26815fafd3b8c4aced17b1f22e68ef33eb8db Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 11:32:40 -0400 Subject: tracing: do not allow modifying the ftrace events via the event files Impact: fix to prevent crash on calling NULL function pointer The ftrace internal records have their format exported via the event system under the ftrace subsystem. These are only for exporting the format to allow binary readers to be able to parse them in a binary output. The ftrace subsystem events can only be enabled via the ftrace tracers and do not have a registering function. The event files expect the event record to have registering function and will call it directly. Passing in a ftrace subsystem event will cause the kernel to crash because it will execute a NULL pointer. This patch prevents the ftrace subsystem from being viewable to the event enabling files. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a0b41cc26f26..85ec10fbb38d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -102,7 +102,7 @@ static int ftrace_set_clr_event(char *buf, int set) mutex_lock(&event_mutex); events_for_each(call) { - if (!call->name) + if (!call->name || !call->regfunc) continue; if (match && @@ -207,8 +207,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) - return NULL; + for (;;) { + if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + return NULL; + + /* + * The ftrace subsystem is for showing formats only. + * They can not be enabled or disabled via the event files. + */ + if (call->regfunc) + break; + + call++; + next = call; + } m->private = ++next; -- cgit v1.2.3-59-g8ed1b From 2314c4ae1461c9e8b26cf8b9a851f280bc5769e1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 12:04:02 -0400 Subject: tracing: add back the available_events file The event directory files type and available_types were no longer needed with the new TRACE_EVENT_FORMAT macros, they were deleted. But by accident the available_events file was also removed. This patch brings it back. Reported-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 85ec10fbb38d..769dfd00fc85 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -428,6 +428,13 @@ static const struct seq_operations show_set_event_seq_ops = { .stop = t_stop, }; +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_event_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static const struct file_operations ftrace_set_event_fops = { .open = ftrace_event_seq_open, .read = seq_read, @@ -569,6 +576,13 @@ static __init int event_trace_init(void) if (!d_tracer) return 0; + entry = debugfs_create_file("available_events", 0444, d_tracer, + (void *)&show_event_seq_ops, + &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_events' entry\n"); + entry = debugfs_create_file("set_event", 0644, d_tracer, (void *)&show_set_event_seq_ops, &ftrace_set_event_fops); -- cgit v1.2.3-59-g8ed1b From 30a8fecc2d34f086df34fe2f2b926f080e002600 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 12:41:38 -0400 Subject: tracing: flip the TP_printk and TP_fast_assign in the TRACE_EVENT macro Impact: clean up In trying to stay consistant with the C style format in the TRACE_EVENT macro, it makes more sense to do the printk after the assigning of the variables. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- include/trace/irq_event_types.h | 8 +-- include/trace/sched_event_types.h | 102 ++++++++++++++++++------------------ kernel/trace/trace_events_stage_1.h | 2 +- kernel/trace/trace_events_stage_2.h | 4 +- kernel/trace/trace_events_stage_3.h | 2 +- 6 files changed, 60 insertions(+), 60 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 69b56988813d..c7b09452514b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,7 +157,7 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#define TRACE_EVENT(name, proto, args, struct, print, assign) \ +#define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 43bcb74dd49f..214bb928fe9e 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -31,13 +31,13 @@ TRACE_EVENT(irq_handler_exit, __field( int, ret ) ), - TP_printk("irq=%d return=%s", - __entry->irq, __entry->ret ? "handled" : "unhandled"), - TP_fast_assign( __entry->irq = irq; __entry->ret = ret; - ) + ), + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? "handled" : "unhandled") ); #undef TRACE_SYSTEM diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h index fb37af672c88..63547dc1125f 100644 --- a/include/trace/sched_event_types.h +++ b/include/trace/sched_event_types.h @@ -22,12 +22,12 @@ TRACE_EVENT(sched_kthread_stop, __field( pid_t, pid ) ), - TP_printk("task %s:%d", __entry->comm, __entry->pid), - TP_fast_assign( memcpy(__entry->comm, t->comm, TASK_COMM_LEN); __entry->pid = t->pid; - ) + ), + + TP_printk("task %s:%d", __entry->comm, __entry->pid) ); /* @@ -43,11 +43,11 @@ TRACE_EVENT(sched_kthread_stop_ret, __field( int, ret ) ), - TP_printk("ret %d", __entry->ret), - TP_fast_assign( __entry->ret = ret; - ) + ), + + TP_printk("ret %d", __entry->ret) ); /* @@ -68,14 +68,14 @@ TRACE_EVENT(sched_wait_task, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -97,16 +97,16 @@ TRACE_EVENT(sched_wakeup, __field( int, success ) ), - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->success = success; - ) + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) ); /* @@ -128,16 +128,16 @@ TRACE_EVENT(sched_wakeup_new, __field( int, success ) ), - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->success = success; - ) + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) ); /* @@ -162,10 +162,6 @@ TRACE_EVENT(sched_switch, __field( int, next_prio ) ), - TP_printk("task %s:%d [%d] ==> %s:%d [%d]", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->next_comm, __entry->next_pid, __entry->next_prio), - TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; @@ -173,7 +169,11 @@ TRACE_EVENT(sched_switch, memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; - ) + ), + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio) ); /* @@ -193,17 +193,17 @@ TRACE_EVENT(sched_migrate_task, __field( int, dest_cpu ) ), - TP_printk("task %s:%d [%d] from: %d to: %d", - __entry->comm, __entry->pid, __entry->prio, - __entry->orig_cpu, __entry->dest_cpu), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; __entry->orig_cpu = orig_cpu; __entry->dest_cpu = dest_cpu; - ) + ), + + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu) ); /* @@ -221,14 +221,14 @@ TRACE_EVENT(sched_process_free, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -246,14 +246,14 @@ TRACE_EVENT(sched_process_exit, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -271,14 +271,14 @@ TRACE_EVENT(sched_process_wait, __field( int, prio ) ), - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio), - TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); __entry->pid = pid_nr(pid); __entry->prio = current->prio; - ) + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) ); /* @@ -297,16 +297,16 @@ TRACE_EVENT(sched_process_fork, __field( pid_t, child_pid ) ), - TP_printk("parent %s:%d child %s:%d", - __entry->parent_comm, __entry->parent_pid, - __entry->child_comm, __entry->child_pid), - TP_fast_assign( memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); __entry->parent_pid = parent->pid; memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); __entry->child_pid = child->pid; - ) + ), + + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + __entry->child_comm, __entry->child_pid) ); /* @@ -324,14 +324,14 @@ TRACE_EVENT(sched_signal_send, __field( pid_t, pid ) ), - TP_printk("sig: %d task %s:%d", - __entry->sig, __entry->comm, __entry->pid), - TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->sig = sig; - ) + ), + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid) ); #undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 15e9bf965a18..82f68443c556 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -27,7 +27,7 @@ #define TP_STRUCT__entry(args...) args #undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ struct ftrace_raw_##name { \ struct trace_entry ent; \ tstruct \ diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index d91bf4c56661..1ad9f8d2fe45 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -39,7 +39,7 @@ #define TP_printk(fmt, args...) fmt "\n", args #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ @@ -115,7 +115,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #define TP_fast_assign(args...) args #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, func) \ +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ ftrace_format_##call(struct trace_seq *s) \ { \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 3ba55d4ab073..d6de06b9201a 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -148,7 +148,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define __entry entry #undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, print, assign) \ +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ static struct ftrace_event_call event_##call; \ \ -- cgit v1.2.3-59-g8ed1b From 823f9124fb2e33eeb624d139978a52089f8a02ae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 12:58:51 -0400 Subject: tracing: document TRACE_EVENT macro in tracepoint.h Impact: clean up / comments Kosaki Motohiro asked about an explanation to the TRACE_EVENT macro. Ingo Molnar replied with a nice description. This patch takes the description that Ingo wrote (with some slight modifications) and adds it to the tracepoint.h file. Reported-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 103 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index c7b09452514b..119ece224c21 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -157,6 +157,109 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) + +/* + * For use with the TRACE_EVENT macro: + * + * We define a tracepoint, its arguments, its printk format + * and its 'fast binay record' layout. + * + * Firstly, name your tracepoint via TRACE_EVENT(name : the + * 'subsystem_event' notation is fine. + * + * Think about this whole construct as the + * 'trace_sched_switch() function' from now on. + * + * + * TRACE_EVENT(sched_switch, + * + * * + * * A function has a regular function arguments + * * prototype, declare it via TP_PROTO(): + * * + * + * TP_PROTO(struct rq *rq, struct task_struct *prev, + * struct task_struct *next), + * + * * + * * Define the call signature of the 'function'. + * * (Design sidenote: we use this instead of a + * * TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.) + * * + * + * TP_ARGS(rq, prev, next), + * + * * + * * Fast binary tracing: define the trace record via + * * TP_STRUCT__entry(). You can think about it like a + * * regular C structure local variable definition. + * * + * * This is how the trace record is structured and will + * * be saved into the ring buffer. These are the fields + * * that will be exposed to user-space in + * * /debug/tracing/events/<*>/format. + * * + * * The declared 'local variable' is called '__entry' + * * + * * __field(pid_t, prev_prid) is equivalent to a standard declariton: + * * + * * pid_t prev_pid; + * * + * * __array(char, prev_comm, TASK_COMM_LEN) is equivalent to: + * * + * * char prev_comm[TASK_COMM_LEN]; + * * + * + * TP_STRUCT__entry( + * __array( char, prev_comm, TASK_COMM_LEN ) + * __field( pid_t, prev_pid ) + * __field( int, prev_prio ) + * __array( char, next_comm, TASK_COMM_LEN ) + * __field( pid_t, next_pid ) + * __field( int, next_prio ) + * ), + * + * * + * * Assign the entry into the trace record, by embedding + * * a full C statement block into TP_fast_assign(). You + * * can refer to the trace record as '__entry' - + * * otherwise you can put arbitrary C code in here. + * * + * * Note: this C code will execute every time a trace event + * * happens, on an active tracepoint. + * * + * + * TP_fast_assign( + * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + * __entry->prev_pid = prev->pid; + * __entry->prev_prio = prev->prio; + * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + * __entry->next_pid = next->pid; + * __entry->next_prio = next->prio; + * ) + * + * * + * * Formatted output of a trace record via TP_printk(). + * * This is how the tracepoint will appear under ftrace + * * plugins that make use of this tracepoint. + * * + * * (raw-binary tracing wont actually perform this step.) + * * + * + * TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + * __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + * __entry->next_comm, __entry->next_pid, __entry->next_prio), + * + * ); + * + * This macro construct is thus used for the regular printk format + * tracing setup, it is used to construct a function pointer based + * tracepoint callback (this is used by programmatic plugins and + * can also by used by generic instrumentation like SystemTap), and + * it is also used to expose a structured trace record in + * /debug/tracing/events/. + */ + #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -- cgit v1.2.3-59-g8ed1b From 0e3d0f0566f3fcf664782f597070bbc669d78454 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 13:12:58 -0400 Subject: tracing: update comments to match event code macros Impact: clean up / comments The comments that described the ftrace macros to manipulate the TRACE_EVENT and TRACE_FORMAT macros no longer match the code. This patch updates them. Reported-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_stage_1.h | 6 ++++-- kernel/trace/trace_events_stage_2.h | 9 ++++----- kernel/trace/trace_events_stage_3.h | 8 ++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 82f68443c556..38985f9b379c 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -6,11 +6,13 @@ * struct ftrace_raw_ { * struct trace_entry ent; * ; + * []; * [...] * }; * - * The is created by the TRACE_FIELD(type, item, assign) - * macro. We simply do "type item;", and that will create the fields + * The is created by the __field(type, item) macro or + * the __array(type2, item2, len) macro. + * We simply do "type item;", and that will create the fields * in the structure. */ diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 1ad9f8d2fe45..ca347afd6aa0 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -20,7 +20,7 @@ * * field = (typeof(field))entry; * - * ret = trace_seq_printf(s, "%s", "\n"); + * ret = trace_seq_printf(s, "\n"); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -76,10 +76,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ * int ret; * * ret = trace_seq_printf(s, #type " " #item ";" - * " size:%d; offset:%d;\n", - * sizeof(field.type), - * offsetof(struct ftrace_raw_##call, - * item)); + * " offset:%u; size:%u;\n", + * offsetof(struct ftrace_raw_##call, item), + * sizeof(field.type)); * * } */ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index d6de06b9201a..6ee1de59f19d 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -56,7 +56,8 @@ * return; * entry = ring_buffer_event_data(event); * - * ; <-- Here we assign the entries by the TRACE_FIELD. + * ; <-- Here we assign the entries by the __field and + * __array macros. * * trace_current_buffer_unlock_commit(event, irq_flags, pc); * } @@ -96,11 +97,10 @@ * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { * .name = "", + * .system = "", + * .raw_init = ftrace_raw_init_event_, * .regfunc = ftrace_reg_event_, * .unregfunc = ftrace_unreg_event_, - * .raw_init = ftrace_raw_init_event_, - * .raw_reg = ftrace_raw_reg_event_, - * .raw_unreg = ftrace_raw_unreg_event_, * .show_format = ftrace_format_, * } * -- cgit v1.2.3-59-g8ed1b From ef18012b248b47ec9a12c3a83ca5e99782d39c5d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 14:10:56 -0400 Subject: tracing: remove funky whitespace in the trace code Impact: clean up There existed a lot of 's in the tracing code. This patch removes them. Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 16 +++--- kernel/trace/blktrace.c | 10 ++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_events_stage_3.h | 98 ++++++++++++++++++------------------ kernel/trace/trace_export.c | 2 +- kernel/trace/trace_functions_graph.c | 6 +-- kernel/trace/trace_output.c | 14 +++--- kernel/trace/trace_workqueue.c | 6 +-- 9 files changed, 78 insertions(+), 78 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 119ece224c21..d35a7ee7611f 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -178,8 +178,8 @@ static inline void tracepoint_synchronize_unregister(void) * * prototype, declare it via TP_PROTO(): * * * - * TP_PROTO(struct rq *rq, struct task_struct *prev, - * struct task_struct *next), + * TP_PROTO(struct rq *rq, struct task_struct *prev, + * struct task_struct *next), * * * * * Define the call signature of the 'function'. @@ -187,7 +187,7 @@ static inline void tracepoint_synchronize_unregister(void) * * TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.) * * * - * TP_ARGS(rq, prev, next), + * TP_ARGS(rq, prev, next), * * * * * Fast binary tracing: define the trace record via @@ -229,13 +229,13 @@ static inline void tracepoint_synchronize_unregister(void) * * happens, on an active tracepoint. * * * - * TP_fast_assign( - * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - * __entry->prev_pid = prev->pid; - * __entry->prev_prio = prev->prio; + * TP_fast_assign( + * memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + * __entry->prev_pid = prev->pid; + * __entry->prev_prio = prev->prio; * memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); * __entry->next_pid = next->pid; - * __entry->next_prio = next->prio; + * __entry->next_prio = next->prio; * ) * * * diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e39679a72a3b..bec69d3678c1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -33,7 +33,7 @@ static struct trace_array *blk_tr; static int __read_mostly blk_tracer_enabled; /* Select an alternative, minimalistic output than the original one */ -#define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CLASSIC 0x1 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ @@ -564,7 +564,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop); /** * blk_trace_ioctl: - handle the ioctls associated with tracing * @bdev: the block device - * @cmd: the ioctl cmd + * @cmd: the ioctl cmd * @arg: the argument data, if any * **/ @@ -1128,9 +1128,9 @@ static void blk_tracer_reset(struct trace_array *tr) static struct { const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); + int (*print)(struct trace_seq *s, const struct trace_entry *ent); } what2act[] __read_mostly = { - [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, @@ -1229,7 +1229,7 @@ static struct tracer blk_tracer __read_mostly = { }; static struct trace_event trace_blk_event = { - .type = TRACE_BLK, + .type = TRACE_BLK, .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cc94f8642485..8c6a902db40a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -799,7 +799,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->tgid = (tsk) ? tsk->tgid : 0; + entry->tgid = (tsk) ? tsk->tgid : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index aaa0755268b9..ad8c22efff41 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -157,7 +157,7 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, static struct trace_event trace_branch_event = { - .type = TRACE_BRANCH, + .type = TRACE_BRANCH, .trace = trace_branch_print, }; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 6ee1de59f19d..ae2e323df0c7 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -5,23 +5,23 @@ * * static void ftrace_event_(proto) * { - * event_trace_printk(_RET_IP_, ": " ); + * event_trace_printk(_RET_IP_, ": " ); * } * * static int ftrace_reg_event_(void) * { - * int ret; + * int ret; * - * ret = register_trace_(ftrace_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; + * ret = register_trace_(ftrace_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; * } * * static void ftrace_unreg_event_(void) * { - * unregister_trace_(ftrace_event_); + * unregister_trace_(ftrace_event_); * } * * For those macros defined with TRACE_FORMAT: @@ -29,9 +29,9 @@ * static struct ftrace_event_call __used * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, + * .name = "", + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, * } * * @@ -41,66 +41,66 @@ * * static void ftrace_raw_event_(proto) * { - * struct ring_buffer_event *event; - * struct ftrace_raw_ *entry; <-- defined in stage 1 - * unsigned long irq_flags; - * int pc; - * - * local_save_flags(irq_flags); - * pc = preempt_count(); - * - * event = trace_current_buffer_lock_reserve(event_.id, - * sizeof(struct ftrace_raw_), - * irq_flags, pc); - * if (!event) - * return; - * entry = ring_buffer_event_data(event); - * - * ; <-- Here we assign the entries by the __field and + * struct ring_buffer_event *event; + * struct ftrace_raw_ *entry; <-- defined in stage 1 + * unsigned long irq_flags; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * event = trace_current_buffer_lock_reserve(event_.id, + * sizeof(struct ftrace_raw_), + * irq_flags, pc); + * if (!event) + * return; + * entry = ring_buffer_event_data(event); + * + * ; <-- Here we assign the entries by the __field and * __array macros. * - * trace_current_buffer_unlock_commit(event, irq_flags, pc); + * trace_current_buffer_unlock_commit(event, irq_flags, pc); * } * * static int ftrace_raw_reg_event_(void) * { - * int ret; + * int ret; * - * ret = register_trace_(ftrace_raw_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; + * ret = register_trace_(ftrace_raw_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; * } * * static void ftrace_unreg_event_(void) * { - * unregister_trace_(ftrace_raw_event_); + * unregister_trace_(ftrace_raw_event_); * } * * static struct trace_event ftrace_event_type_ = { - * .trace = ftrace_raw_output_, <-- stage 2 + * .trace = ftrace_raw_output_, <-- stage 2 * }; * * static int ftrace_raw_init_event_(void) * { - * int id; + * int id; * - * id = register_ftrace_event(&ftrace_event_type_); - * if (!id) - * return -ENODEV; - * event_.id = id; - * return 0; + * id = register_ftrace_event(&ftrace_event_type_); + * if (!id) + * return -ENODEV; + * event_.id = id; + * return 0; * } * * static struct ftrace_event_call __used * __attribute__((__aligned__(4))) * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", + * .name = "", * .system = "", - * .raw_init = ftrace_raw_init_event_, - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, + * .raw_init = ftrace_raw_init_event_, + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, * .show_format = ftrace_format_, * } * @@ -138,7 +138,7 @@ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ @@ -163,7 +163,7 @@ static void ftrace_raw_event_##call(proto) \ pc = preempt_count(); \ \ event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ + sizeof(struct ftrace_raw_##call), \ irq_flags, pc); \ if (!event) \ return; \ @@ -208,7 +208,7 @@ static int ftrace_raw_init_event_##call(void) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event_##call, \ .regfunc = ftrace_raw_reg_event_##call, \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 23ae78430d58..4d9952d3df50 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -94,7 +94,7 @@ ftrace_format_##call(struct trace_seq *s) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ + .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ .show_format = ftrace_format_##call, \ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 453ebd3b636e..d1493b853e41 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -841,12 +841,12 @@ static void graph_trace_close(struct trace_iterator *iter) } static struct tracer graph_trace __read_mostly = { - .name = "function_graph", + .name = "function_graph", .open = graph_trace_open, .close = graph_trace_close, .wait_pipe = poll_wait_pipe, - .init = graph_trace_init, - .reset = graph_trace_reset, + .init = graph_trace_init, + .reset = graph_trace_reset, .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ef8fd661b217..491832af9ba1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -565,7 +565,7 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) } static struct trace_event trace_fn_event = { - .type = TRACE_FN, + .type = TRACE_FN, .trace = trace_fn_trace, .raw = trace_fn_raw, .hex = trace_fn_hex, @@ -696,7 +696,7 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, } static struct trace_event trace_ctx_event = { - .type = TRACE_CTX, + .type = TRACE_CTX, .trace = trace_ctx_print, .raw = trace_ctx_raw, .hex = trace_ctx_hex, @@ -704,7 +704,7 @@ static struct trace_event trace_ctx_event = { }; static struct trace_event trace_wake_event = { - .type = TRACE_WAKE, + .type = TRACE_WAKE, .trace = trace_wake_print, .raw = trace_wake_raw, .hex = trace_wake_hex, @@ -759,7 +759,7 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter, } static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, + .type = TRACE_SPECIAL, .trace = trace_special_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -796,7 +796,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, } static struct trace_event trace_stack_event = { - .type = TRACE_STACK, + .type = TRACE_STACK, .trace = trace_stack_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -825,7 +825,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, } static struct trace_event trace_user_stack_event = { - .type = TRACE_USER_STACK, + .type = TRACE_USER_STACK, .trace = trace_user_stack_print, .raw = trace_special_print, .hex = trace_special_hex, @@ -879,7 +879,7 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) static struct trace_event trace_print_event = { - .type = TRACE_PRINT, + .type = TRACE_PRINT, .trace = trace_print_print, .raw = trace_print_raw, }; diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 4664990fe9c5..e542483df623 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -19,14 +19,14 @@ struct cpu_workqueue_stats { /* Useful to know if we print the cpu headers */ bool first_entry; int cpu; - pid_t pid; + pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ - atomic_t inserted; + atomic_t inserted; /* * Don't need to be atomic, works are serialized in a single workqueue thread * on a single CPU. */ - unsigned int executed; + unsigned int executed; }; /* List of workqueue threads on one cpu */ -- cgit v1.2.3-59-g8ed1b From 6cc3c6e12bb039047974ad2e7e2d46d15a1b762f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Mar 2009 19:03:43 +0100 Subject: trace_clock: fix preemption bug Using the function_graph tracer in recent kernels generates a spew of preemption BUGs. Fix this by not requiring trace_clock_local() users to disable preemption themselves. Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_clock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 2d4953f93560..05b176abfd30 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -27,12 +27,19 @@ */ u64 notrace trace_clock_local(void) { + unsigned long flags; + u64 clock; + /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - return sched_clock(); + raw_local_irq_save(flags); + clock = sched_clock(); + raw_local_irq_restore(flags); + + return clock; } /* -- cgit v1.2.3-59-g8ed1b From 80370cb758e7ca2692cd9fb5e413d970b1f4b2b2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 10 Mar 2009 17:16:35 -0400 Subject: tracing: use raw spinlocks for trace_vprintk Impact: prevent locking up by lockdep tracer The lockdep tracer uses trace_vprintk and thus trace_vprintk can not call back into lockdep without locking up. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8c6a902db40a..4c97947650ae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1176,7 +1176,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace) */ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) { - static DEFINE_SPINLOCK(trace_buf_lock); + static raw_spinlock_t trace_buf_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; struct ring_buffer_event *event; @@ -1201,7 +1202,9 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) if (unlikely(atomic_read(&data->disabled))) goto out; - spin_lock_irqsave(&trace_buf_lock, flags); + /* Lockdep uses trace_printk for lock tracing */ + local_irq_save(flags); + __raw_spin_lock(&trace_buf_lock); len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); if (len > TRACE_BUF_SIZE || len < 0) @@ -1220,7 +1223,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) ring_buffer_unlock_commit(tr->buffer, event); out_unlock: - spin_unlock_irqrestore(&trace_buf_lock, flags); + __raw_spin_unlock(&trace_buf_lock); + local_irq_restore(flags); out: ftrace_preempt_enable(resched); -- cgit v1.2.3-59-g8ed1b From 73c5162aa362a543793f4a957c6c536dcbaa89ce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 13:42:01 -0400 Subject: tracing: keep ring buffer to minimum size till used Impact: less memory impact on systems not using tracer When the kernel boots up that has tracing configured, it allocates the default size of the ring buffer. This currently happens to be 1.4Megs per possible CPU. This is quite a bit of wasted memory if the system is never using the tracer. The current solution is to keep the ring buffers to a minimum size until the user uses them. Once a tracer is piped into the current_tracer the ring buffer will be expanded to the default size. If the user changes the size of the ring buffer, it will take the size given by the user immediately. If the user adds a "ftrace=" to the kernel command line, then the ring buffers will be set to the default size on initialization. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 79 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 20 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4c97947650ae..0c1dc1850858 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -44,6 +44,12 @@ unsigned long __read_mostly tracing_max_latency; unsigned long __read_mostly tracing_thresh; +/* + * On boot up, the ring buffer is set to the minimum size, so that + * we do not waste memory on systems that are not using tracing. + */ +static int ring_buffer_expanded; + /* * We need to change this state when a selftest is running. * A selftest will lurk into the ring-buffer to count the @@ -128,6 +134,8 @@ static int __init set_ftrace(char *str) { strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; + /* We are using ftrace early, expand it */ + ring_buffer_expanded = 1; return 1; } __setup("ftrace=", set_ftrace); @@ -2315,6 +2323,40 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } +static int tracing_resize_ring_buffer(unsigned long size) +{ + int ret; + + /* + * If kernel or user changes the size of the ring buffer + * it get completed. + */ + ring_buffer_expanded = 1; + + ret = ring_buffer_resize(global_trace.buffer, size); + if (ret < 0) + return ret; + + ret = ring_buffer_resize(max_tr.buffer, size); + if (ret < 0) { + int r; + + r = ring_buffer_resize(global_trace.buffer, + global_trace.entries); + if (r < 0) { + /* AARGH! We are left with different + * size max buffer!!!! */ + WARN_ON(1); + tracing_disabled = 1; + } + return ret; + } + + global_trace.entries = size; + + return ret; +} + struct trace_option_dentry; static struct trace_option_dentry * @@ -2330,6 +2372,13 @@ static int tracing_set_tracer(const char *buf) struct tracer *t; int ret = 0; + if (!ring_buffer_expanded) { + ret = tracing_resize_ring_buffer(trace_buf_size); + if (ret < 0) + return ret; + ret = 0; + } + mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { if (strcmp(t->name, buf) == 0) @@ -2903,28 +2952,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, val <<= 10; if (val != global_trace.entries) { - ret = ring_buffer_resize(global_trace.buffer, val); + ret = tracing_resize_ring_buffer(val); if (ret < 0) { cnt = ret; goto out; } - - ret = ring_buffer_resize(max_tr.buffer, val); - if (ret < 0) { - int r; - cnt = ret; - r = ring_buffer_resize(global_trace.buffer, - global_trace.entries); - if (r < 0) { - /* AARGH! We are left with different - * size max buffer!!!! */ - WARN_ON(1); - tracing_disabled = 1; - } - goto out; - } - - global_trace.entries = val; } filp->f_pos += cnt; @@ -3916,6 +3948,7 @@ void ftrace_dump(void) __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; + int ring_buf_size; int i; int ret = -ENOMEM; @@ -3928,12 +3961,18 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) goto out_free_tracing_cpumask; + /* To save memory, keep the ring buffer size to its minimum */ + if (ring_buffer_expanded) + ring_buf_size = trace_buf_size; + else + ring_buf_size = 1; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(trace_buf_size, + global_trace.buffer = ring_buffer_alloc(ring_buf_size, TRACE_BUFFER_FLAGS); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); @@ -3944,7 +3983,7 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(trace_buf_size, + max_tr.buffer = ring_buffer_alloc(ring_buf_size, TRACE_BUFFER_FLAGS); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); -- cgit v1.2.3-59-g8ed1b From 1852fcce181faa237c010a3dbedb473cf9d4555f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 14:33:00 -0400 Subject: tracing: expand the ring buffers when an event is activated To save memory, the tracer ring buffers are set to a minimum. The activating of a trace expands the ring buffer size. This patch adds this expanding, when an event is activated. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 20 ++++++++++++++++++++ kernel/trace/trace.h | 3 +++ kernel/trace/trace_events.c | 8 ++++++++ 3 files changed, 31 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0c1dc1850858..35ee63ae4122 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2357,6 +2357,26 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } +/** + * tracing_update_buffers - used by tracing facility to expand ring buffers + * + * To save on memory when the tracing is never used on a system with it + * configured in. The ring buffers are set to a minimum size. But once + * a user starts to use the tracing facility, then they need to grow + * to their default size. + * + * This function is to be called when a tracer is about to be used. + */ +int tracing_update_buffers(void) +{ + int ret = 0; + + if (!ring_buffer_expanded) + ret = tracing_resize_ring_buffer(trace_buf_size); + + return ret; +} + struct trace_option_dentry; static struct trace_option_dentry * diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c5e1d8865fe4..336324d717f8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -737,6 +737,9 @@ static inline void trace_branch_disable(void) } #endif /* CONFIG_BRANCH_TRACER */ +/* set ring buffers to default size if not already done so */ +int tracing_update_buffers(void); + /* trace event type bit fields, not numeric */ enum { TRACE_EVENT_TYPE_PRINTF = 1, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 769dfd00fc85..ca624df73591 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -141,6 +141,10 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (!cnt || cnt < 0) return 0; + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + ret = get_user(ch, ubuf++); if (ret) return ret; @@ -331,6 +335,10 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + switch (val) { case 0: case 1: -- cgit v1.2.3-59-g8ed1b From 9aba60fe6eb20453de53a572143bef22fa929fba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 19:52:30 -0400 Subject: tracing: fix trace_wait to know to wait on all cpus or just one Impact: fix to task live locking on reading trace_pipe on one CPU The same code is used for both trace_pipe (all CPUS) and the per_cpu trace_pipe file. When there is no data to read, it will check for signals and wait on the trace wait queue. The problem happens with the per_cpu wait. The trace_wait code checks all CPUs. Thus, if there's data in another CPU buffer, then it will exit the wait, without checking for signals or waiting on the wait queue. It would then try to read the empty buffer, and since that will just return nothing, then it will try to wait again. Unfortunately, that will again fail due to there still being data in the other buffers. This ends up with a live lock for the task. This patch fixes the trace_wait to be aware that the iterator may only be waiting on a single buffer. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 35ee63ae4122..e60f4be10d64 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1666,6 +1666,19 @@ static int trace_empty(struct trace_iterator *iter) { int cpu; + /* If we are looking at one CPU buffer, only check that one */ + if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { + cpu = iter->cpu_file; + if (iter->buffer_iter[cpu]) { + if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + return 0; + } + return 1; + } + for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) { if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) -- cgit v1.2.3-59-g8ed1b From 554f786e284a6ce859d51f62240d615603944c8e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 11 Mar 2009 22:00:13 -0400 Subject: ring-buffer: only allocate buffers for online cpus Impact: save on memory Currently, a ring buffer was allocated for each "possible_cpus". On some systems, this is the same as NR_CPUS. Thus, if a system defined NR_CPUS = 64 but it only had 1 CPU, we could have possibly 63 useless ring buffers taking up space. With a default buffer of 3 megs, this could be quite drastic. This patch changes the ring buffer code to only allocate ring buffers for online CPUs. If a CPU goes off line, we do not free the buffer. This is because the user may still have trace data in that buffer that they would like to look at. Perhaps in the future we could add code to delete a ring buffer if the CPU is offline and the ring buffer becomes empty. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 266 ++++++++++++++++++++++++++++++++++++--------- kernel/trace/trace.c | 6 - 2 files changed, 216 insertions(+), 56 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 178858492a89..d07c2888396f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "trace.h" @@ -301,6 +302,10 @@ struct ring_buffer { struct mutex mutex; struct ring_buffer_per_cpu **buffers; + +#ifdef CONFIG_HOTPLUG + struct notifier_block cpu_notify; +#endif }; struct ring_buffer_iter { @@ -459,6 +464,11 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) */ extern int ring_buffer_page_too_big(void); +#ifdef CONFIG_HOTPLUG +static int __cpuinit rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); +#endif + /** * ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. @@ -496,7 +506,8 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (buffer->pages == 1) buffer->pages++; - cpumask_copy(buffer->cpumask, cpu_possible_mask); + get_online_cpus(); + cpumask_copy(buffer->cpumask, cpu_online_mask); buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -512,6 +523,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) goto fail_free_buffers; } +#ifdef CONFIG_HOTPLUG + buffer->cpu_notify.notifier_call = rb_cpu_notify; + buffer->cpu_notify.priority = 0; + register_cpu_notifier(&buffer->cpu_notify); +#endif + + put_online_cpus(); mutex_init(&buffer->mutex); return buffer; @@ -525,6 +543,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) fail_free_cpumask: free_cpumask_var(buffer->cpumask); + put_online_cpus(); fail_free_buffer: kfree(buffer); @@ -541,9 +560,17 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; + get_online_cpus(); + +#ifdef CONFIG_HOTPLUG + unregister_cpu_notifier(&buffer->cpu_notify); +#endif + for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); + put_online_cpus(); + free_cpumask_var(buffer->cpumask); kfree(buffer); @@ -649,16 +676,15 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) return size; mutex_lock(&buffer->mutex); + get_online_cpus(); nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); if (size < buffer_size) { /* easy case, just free pages */ - if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) + goto out_fail; rm_pages = buffer->pages - nr_pages; @@ -677,10 +703,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) * add these pages to the cpu_buffers. Otherwise we just free * them all and return -ENOMEM; */ - if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) + goto out_fail; new_pages = nr_pages - buffer->pages; @@ -705,13 +729,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) rb_insert_pages(cpu_buffer, &pages, new_pages); } - if (RB_WARN_ON(buffer, !list_empty(&pages))) { - mutex_unlock(&buffer->mutex); - return -1; - } + if (RB_WARN_ON(buffer, !list_empty(&pages))) + goto out_fail; out: buffer->pages = nr_pages; + put_online_cpus(); mutex_unlock(&buffer->mutex); return size; @@ -721,8 +744,18 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) list_del_init(&bpage->list); free_buffer_page(bpage); } + put_online_cpus(); mutex_unlock(&buffer->mutex); return -ENOMEM; + + /* + * Something went totally wrong, and we are too paranoid + * to even clean up the mess. + */ + out_fail: + put_online_cpus(); + mutex_unlock(&buffer->mutex); + return -1; } EXPORT_SYMBOL_GPL(ring_buffer_resize); @@ -1528,11 +1561,15 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + get_online_cpus(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; + goto out; cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); + out: + put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); @@ -1548,11 +1585,15 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + get_online_cpus(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; + goto out; cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); + out: + put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); @@ -1564,12 +1605,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret = 0; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; + goto out; cpu_buffer = buffer->buffers[cpu]; - return cpu_buffer->entries; + ret = cpu_buffer->entries; + out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); @@ -1581,12 +1629,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret = 0; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 0; + goto out; cpu_buffer = buffer->buffers[cpu]; - return cpu_buffer->overrun; + ret = cpu_buffer->overrun; + out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); @@ -1603,12 +1658,16 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) unsigned long entries = 0; int cpu; + get_online_cpus(); + /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += cpu_buffer->entries; } + put_online_cpus(); + return entries; } EXPORT_SYMBOL_GPL(ring_buffer_entries); @@ -1626,12 +1685,16 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) unsigned long overruns = 0; int cpu; + get_online_cpus(); + /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; overruns += cpu_buffer->overrun; } + put_online_cpus(); + return overruns; } EXPORT_SYMBOL_GPL(ring_buffer_overruns); @@ -1663,9 +1726,14 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) */ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) { - struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -1900,9 +1968,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) struct buffer_page *reader; int nr_loops = 0; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; - cpu_buffer = buffer->buffers[cpu]; again: @@ -2028,13 +2093,21 @@ struct ring_buffer_event * ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event; + struct ring_buffer_event *event = NULL; unsigned long flags; + get_online_cpus(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + put_online_cpus(); + return event; } @@ -2071,24 +2144,31 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event * ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event = NULL; unsigned long flags; + /* might be called in atomic */ + preempt_disable(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; + goto out; + cpu_buffer = buffer->buffers[cpu]; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); if (!event) - goto out; + goto out_unlock; rb_advance_reader(cpu_buffer); - out: + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + preempt_enable(); + return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); @@ -2109,15 +2189,17 @@ struct ring_buffer_iter * ring_buffer_read_start(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_iter *iter; + struct ring_buffer_iter *iter = NULL; unsigned long flags; + get_online_cpus(); + if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return NULL; + goto out; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) - return NULL; + goto out; cpu_buffer = buffer->buffers[cpu]; @@ -2132,6 +2214,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + put_online_cpus(); + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -2224,9 +2309,13 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; + int resched; + + /* Can't use get_online_cpus because this can be in atomic */ + resched = ftrace_preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return; + goto out; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2237,6 +2326,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2246,10 +2337,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); */ void ring_buffer_reset(struct ring_buffer *buffer) { + int resched; int cpu; + /* Can't use get_online_cpus because this can be in atomic */ + resched = ftrace_preempt_disable(); + for_each_buffer_cpu(buffer, cpu) ring_buffer_reset_cpu(buffer, cpu); + + ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset); @@ -2262,12 +2359,17 @@ int ring_buffer_empty(struct ring_buffer *buffer) struct ring_buffer_per_cpu *cpu_buffer; int cpu; + get_online_cpus(); + /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; if (!rb_per_cpu_empty(cpu_buffer)) return 0; } + + put_online_cpus(); + return 1; } EXPORT_SYMBOL_GPL(ring_buffer_empty); @@ -2280,12 +2382,20 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; + int ret = 1; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - return 1; + goto out; cpu_buffer = buffer->buffers[cpu]; - return rb_per_cpu_empty(cpu_buffer); + ret = rb_per_cpu_empty(cpu_buffer); + + out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); @@ -2304,32 +2414,37 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, { struct ring_buffer_per_cpu *cpu_buffer_a; struct ring_buffer_per_cpu *cpu_buffer_b; + int ret = -EINVAL; + + get_online_cpus(); if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - return -EINVAL; + goto out; /* At least make sure the two buffers are somewhat the same */ if (buffer_a->pages != buffer_b->pages) - return -EINVAL; + goto out; + + ret = -EAGAIN; if (ring_buffer_flags != RB_BUFFERS_ON) - return -EAGAIN; + goto out; if (atomic_read(&buffer_a->record_disabled)) - return -EAGAIN; + goto out; if (atomic_read(&buffer_b->record_disabled)) - return -EAGAIN; + goto out; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; if (atomic_read(&cpu_buffer_a->record_disabled)) - return -EAGAIN; + goto out; if (atomic_read(&cpu_buffer_b->record_disabled)) - return -EAGAIN; + goto out; /* * We can't do a synchronize_sched here because this @@ -2349,7 +2464,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); - return 0; + ret = 0; +out: + put_online_cpus(); + + return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -2464,27 +2583,32 @@ int ring_buffer_read_page(struct ring_buffer *buffer, u64 save_timestamp; int ret = -1; + get_online_cpus(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) - return -1; + goto out; len -= BUF_PAGE_HDR_SIZE; if (!data_page) - return -1; + goto out; bpage = *data_page; if (!bpage) - return -1; + goto out; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) - goto out; + goto out_unlock; event = rb_reader_event(cpu_buffer); @@ -2506,7 +2630,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, unsigned int size; if (full) - goto out; + goto out_unlock; if (len > (commit - read)) len = (commit - read); @@ -2514,7 +2638,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, size = rb_event_length(event); if (len < size) - goto out; + goto out_unlock; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -2553,9 +2677,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } ret = read; - out: + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + out: + put_online_cpus(); + return ret; } @@ -2629,3 +2756,42 @@ static __init int rb_init_debugfs(void) } fs_initcall(rb_init_debugfs); + +#ifdef CONFIG_HOTPLUG +static int __cpuinit rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct ring_buffer *buffer = + container_of(self, struct ring_buffer, cpu_notify); + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (cpu_isset(cpu, *buffer->cpumask)) + return NOTIFY_OK; + + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %ld\n", + cpu); + return NOTIFY_OK; + } + smp_wmb(); + cpu_set(cpu, *buffer->cpumask); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* + * Do nothing. + * If we were to free the buffer, then the user would + * lose any trace that was in the buffer. + */ + break; + default: + break; + } + return NOTIFY_OK; +} +#endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e60f4be10d64..14c98f6a47bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1805,17 +1805,11 @@ __tracing_open(struct inode *inode, struct file *file) iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); - - if (!iter->buffer_iter[cpu]) - goto fail_buffer; } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = ring_buffer_read_start(iter->tr->buffer, cpu); - - if (!iter->buffer_iter[cpu]) - goto fail; } /* TODO stop tracer */ -- cgit v1.2.3-59-g8ed1b From d820ac4c2fa881079e6b689d2098adce337558ae Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 01:30:40 +0100 Subject: locking: rename trace_softirq_[enter|exit] => lockdep_softirq_[enter|exit] Impact: cleanup The naming clashes with upcoming softirq tracepoints, so rename the APIs to lockdep_*(). Requested-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 8 ++++---- kernel/softirq.c | 4 ++-- lib/locking-selftest.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 74bde13224c9..b02a3f1d46a0 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -24,8 +24,8 @@ # define trace_softirqs_enabled(p) ((p)->softirqs_enabled) # define trace_hardirq_enter() do { current->hardirq_context++; } while (0) # define trace_hardirq_exit() do { current->hardirq_context--; } while (0) -# define trace_softirq_enter() do { current->softirq_context++; } while (0) -# define trace_softirq_exit() do { current->softirq_context--; } while (0) +# define lockdep_softirq_enter() do { current->softirq_context++; } while (0) +# define lockdep_softirq_exit() do { current->softirq_context--; } while (0) # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, #else # define trace_hardirqs_on() do { } while (0) @@ -38,8 +38,8 @@ # define trace_softirqs_enabled(p) 0 # define trace_hardirq_enter() do { } while (0) # define trace_hardirq_exit() do { } while (0) -# define trace_softirq_enter() do { } while (0) -# define trace_softirq_exit() do { } while (0) +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) # define INIT_TRACE_IRQFLAGS #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 9041ea7948fe..08a030f85416 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -180,7 +180,7 @@ asmlinkage void __do_softirq(void) account_system_vtime(current); __local_bh_disable((unsigned long)__builtin_return_address(0)); - trace_softirq_enter(); + lockdep_softirq_enter(); cpu = smp_processor_id(); restart: @@ -220,7 +220,7 @@ restart: if (pending) wakeup_softirqd(); - trace_softirq_exit(); + lockdep_softirq_exit(); account_system_vtime(current); _local_bh_enable(); diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 280332c1827c..619313ed6c46 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -157,11 +157,11 @@ static void init_shared_classes(void) #define SOFTIRQ_ENTER() \ local_bh_disable(); \ local_irq_disable(); \ - trace_softirq_enter(); \ + lockdep_softirq_enter(); \ WARN_ON(!in_softirq()); #define SOFTIRQ_EXIT() \ - trace_softirq_exit(); \ + lockdep_softirq_exit(); \ local_irq_enable(); \ local_bh_enable(); -- cgit v1.2.3-59-g8ed1b From a123c52b46a1f84bcec3dc963351896c6d6afaf7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 11:21:08 -0400 Subject: tracing: fix comments about trace buffer resizing Impact: cleanup Some of the comments about the trace buffer resizing is gobbledygook. And I wonder why people question if I'm a native English speaker. This patch makes the comments make a bit more sense. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c3946a6df34e..c61ee85c50bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2336,7 +2336,8 @@ static int tracing_resize_ring_buffer(unsigned long size) /* * If kernel or user changes the size of the ring buffer - * it get completed. + * we use the size that was given, and we can forget about + * expanding it later. */ ring_buffer_expanded = 1; @@ -2351,8 +2352,20 @@ static int tracing_resize_ring_buffer(unsigned long size) r = ring_buffer_resize(global_trace.buffer, global_trace.entries); if (r < 0) { - /* AARGH! We are left with different - * size max buffer!!!! */ + /* + * AARGH! We are left with different + * size max buffer!!!! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the size of the main buffer, but failed to + * update the size of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ WARN_ON(1); tracing_disabled = 1; } -- cgit v1.2.3-59-g8ed1b From 1027fcb206a0fb8348e63aff078c74bdee1c2698 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 11:33:20 -0400 Subject: tracing: protect ring_buffer_expanded with trace_types_lock Impact: prevent races with ring_buffer_expanded This patch places the expanding of the tracing buffer under the protection of the trace_types_lock mutex. It is highly unlikely that there would be any contention, but better safe than sorry. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c61ee85c50bb..04ab8243a13d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2391,8 +2391,10 @@ int tracing_update_buffers(void) { int ret = 0; + mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) ret = tracing_resize_ring_buffer(trace_buf_size); + mutex_unlock(&trace_types_lock); return ret; } @@ -2412,6 +2414,8 @@ static int tracing_set_tracer(const char *buf) struct tracer *t; int ret = 0; + mutex_lock(&trace_types_lock); + if (!ring_buffer_expanded) { ret = tracing_resize_ring_buffer(trace_buf_size); if (ret < 0) @@ -2419,7 +2423,6 @@ static int tracing_set_tracer(const char *buf) ret = 0; } - mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { if (strcmp(t->name, buf) == 0) break; -- cgit v1.2.3-59-g8ed1b From 59222efe2d184956464abe5b637bc842ff053b93 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 11:46:03 -0400 Subject: ring-buffer: use CONFIG_HOTPLUG_CPU not CONFIG_HOTPLUG The hotplug code in the ring buffers is for use with CPU hotplug, not generic hotplug. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d07c2888396f..035b56c3a6c9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -303,7 +303,7 @@ struct ring_buffer { struct ring_buffer_per_cpu **buffers; -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif }; @@ -464,7 +464,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) */ extern int ring_buffer_page_too_big(void); -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU static int __cpuinit rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu); #endif @@ -523,7 +523,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) goto fail_free_buffers; } -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU buffer->cpu_notify.notifier_call = rb_cpu_notify; buffer->cpu_notify.priority = 0; register_cpu_notifier(&buffer->cpu_notify); @@ -562,7 +562,7 @@ ring_buffer_free(struct ring_buffer *buffer) get_online_cpus(); -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU unregister_cpu_notifier(&buffer->cpu_notify); #endif @@ -2757,7 +2757,7 @@ static __init int rb_init_debugfs(void) fs_initcall(rb_init_debugfs); -#ifdef CONFIG_HOTPLUG +#ifdef CONFIG_HOTPLUG_CPU static int __cpuinit rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { -- cgit v1.2.3-59-g8ed1b From 8aabee573dff131a085c63de7667eacd94ba4ccb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 13:13:49 -0400 Subject: ring-buffer: remove unneeded get_online_cpus Impact: speed up and remove possible races The get_online_cpus was added to the ring buffer because the original design would free the ring buffer on a CPU that was being taken off line. The final design kept the ring buffer around even when the CPU was taken off line. This is to allow a user to still read the information on that ring buffer. Most of the get_online_cpus are no longer needed since the ring buffer will not disappear from the use cases. Reported-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 90 ++++++++-------------------------------------- 1 file changed, 14 insertions(+), 76 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 035b56c3a6c9..2c36be9fac2e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1561,15 +1561,11 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return; cpu_buffer = buffer->buffers[cpu]; atomic_inc(&cpu_buffer->record_disabled); - out: - put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); @@ -1585,15 +1581,11 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return; cpu_buffer = buffer->buffers[cpu]; atomic_dec(&cpu_buffer->record_disabled); - out: - put_online_cpus(); } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); @@ -1605,17 +1597,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret = 0; - - get_online_cpus(); + unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return 0; cpu_buffer = buffer->buffers[cpu]; ret = cpu_buffer->entries; - out: - put_online_cpus(); return ret; } @@ -1629,17 +1617,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret = 0; - - get_online_cpus(); + unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return 0; cpu_buffer = buffer->buffers[cpu]; ret = cpu_buffer->overrun; - out: - put_online_cpus(); return ret; } @@ -1658,16 +1642,12 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) unsigned long entries = 0; int cpu; - get_online_cpus(); - /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; entries += cpu_buffer->entries; } - put_online_cpus(); - return entries; } EXPORT_SYMBOL_GPL(ring_buffer_entries); @@ -1685,16 +1665,12 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) unsigned long overruns = 0; int cpu; - get_online_cpus(); - /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; overruns += cpu_buffer->overrun; } - put_online_cpus(); - return overruns; } EXPORT_SYMBOL_GPL(ring_buffer_overruns); @@ -2093,21 +2069,16 @@ struct ring_buffer_event * ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_event *event = NULL; + struct ring_buffer_event *event; unsigned long flags; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return NULL; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - out: - put_online_cpus(); - return event; } @@ -2189,17 +2160,15 @@ struct ring_buffer_iter * ring_buffer_read_start(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_iter *iter = NULL; + struct ring_buffer_iter *iter; unsigned long flags; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return NULL; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) - goto out; + return NULL; cpu_buffer = buffer->buffers[cpu]; @@ -2214,9 +2183,6 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - out: - put_online_cpus(); - return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -2309,13 +2275,9 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; unsigned long flags; - int resched; - - /* Can't use get_online_cpus because this can be in atomic */ - resched = ftrace_preempt_disable(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return; spin_lock_irqsave(&cpu_buffer->reader_lock, flags); @@ -2326,8 +2288,6 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - out: - ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -2337,16 +2297,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); */ void ring_buffer_reset(struct ring_buffer *buffer) { - int resched; int cpu; - /* Can't use get_online_cpus because this can be in atomic */ - resched = ftrace_preempt_disable(); - for_each_buffer_cpu(buffer, cpu) ring_buffer_reset_cpu(buffer, cpu); - - ftrace_preempt_enable(resched); } EXPORT_SYMBOL_GPL(ring_buffer_reset); @@ -2359,8 +2313,6 @@ int ring_buffer_empty(struct ring_buffer *buffer) struct ring_buffer_per_cpu *cpu_buffer; int cpu; - get_online_cpus(); - /* yes this is racy, but if you don't like the race, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -2368,8 +2320,6 @@ int ring_buffer_empty(struct ring_buffer *buffer) return 0; } - put_online_cpus(); - return 1; } EXPORT_SYMBOL_GPL(ring_buffer_empty); @@ -2382,18 +2332,14 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty); int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - int ret = 1; - - get_online_cpus(); + int ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return 1; cpu_buffer = buffer->buffers[cpu]; ret = rb_per_cpu_empty(cpu_buffer); - out: - put_online_cpus(); return ret; } @@ -2416,8 +2362,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, struct ring_buffer_per_cpu *cpu_buffer_b; int ret = -EINVAL; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) goto out; @@ -2466,8 +2410,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, ret = 0; out: - put_online_cpus(); - return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -2583,8 +2525,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, u64 save_timestamp; int ret = -1; - get_online_cpus(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) goto out; @@ -2681,8 +2621,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: - put_online_cpus(); - return ret; } -- cgit v1.2.3-59-g8ed1b From db526ca329f855510e8ce672332eba3304aed590 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 13:53:25 -0400 Subject: tracing: show that buffer size is not expanded Impact: do not confuse user on small trace buffer sizes When the system boots up, the trace buffer is small to conserve memory. It is only two pages per online CPU. When the tracer is used, it expands to the default value. This can confuse the user if they look at the buffer size and see only 7, but then later they see 1408. # cat /debug/tracing/buffer_size_kb 7 # echo sched_switch > /debug/tracing/current_tracer # cat /debug/tracing/buffer_size_kb 1408 This patch tries to help remove this confustion by showing that the buffer has not been expanded. # cat /debug/tracing/buffer_size_kb 7 (expanded: 1408) Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 04ab8243a13d..62a63b2b33dd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2948,10 +2948,18 @@ tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - char buf[64]; + char buf[96]; int r; - r = sprintf(buf, "%lu\n", tr->entries >> 10); + mutex_lock(&trace_types_lock); + if (!ring_buffer_expanded) + r = sprintf(buf, "%lu (expanded: %lu)\n", + tr->entries >> 10, + trace_buf_size >> 10); + else + r = sprintf(buf, "%lu\n", tr->entries >> 10); + mutex_unlock(&trace_types_lock); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -- cgit v1.2.3-59-g8ed1b From 48ead02030f849d011259244bb4ea9b985479006 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 12 Mar 2009 18:24:49 +0100 Subject: tracing/core: bring back raw trace_printk for dynamic formats strings Impact: fix callsites with dynamic format strings Since its new binary implementation, trace_printk() internally uses static containers for the format strings on each callsites. But the value is assigned once at build time, which means that it can't take dynamic formats. So this patch unearthes the raw trace_printk implementation for the callers that will need trace_printk to be able to carry these dynamic format strings. The trace_printk() macro will use the appropriate implementation for each callsite. Most of the time however, the binary implementation will still be used. The other impact of this patch is that mmiotrace_printk() will use the old implementation because it calls the low level trace_vprintk and we can't guess here whether the format passed in it is dynamic or not. Some parts of this patch have been written by Steven Rostedt (most notably the part that chooses the appropriate implementation for each callsites). Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 40 +++++++++++------ kernel/trace/trace.c | 85 +++++++++++++++++++++++++++++++++--- kernel/trace/trace.h | 13 +++++- kernel/trace/trace_event_types.h | 11 ++++- kernel/trace/trace_functions_graph.c | 6 +-- kernel/trace/trace_mmiotrace.c | 7 +-- kernel/trace/trace_output.c | 57 +++++++++++++++++++++--- kernel/trace/trace_printk.c | 33 +++++++++++--- 8 files changed, 213 insertions(+), 39 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7742798c9208..1daca3b062bb 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -452,31 +452,45 @@ do { \ #define trace_printk(fmt, args...) \ do { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))); \ - \ - if (!trace_printk_fmt) \ - trace_printk_fmt = fmt; \ - \ __trace_printk_check_format(fmt, ##args); \ - __trace_printk(_THIS_IP_, trace_printk_fmt, ##args); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(_THIS_IP_, fmt, ##args); \ } while (0) +extern int +__trace_bprintk(unsigned long ip, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + extern int __trace_printk(unsigned long ip, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement. + */ #define ftrace_vprintk(fmt, vargs) \ do { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))); \ - \ - if (!trace_printk_fmt) \ - trace_printk_fmt = fmt; \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ \ - __ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + } else \ + __ftrace_vprintk(_THIS_IP_, fmt, vargs); \ } while (0) +extern int +__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); + extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62a63b2b33dd..dbb077d8a172 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1179,10 +1179,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace) /** - * trace_vprintk - write binary msg to tracing buffer + * trace_vbprintk - write binary msg to tracing buffer * */ -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -1191,7 +1191,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct print_entry *entry; + struct bprint_entry *entry; unsigned long flags; int resched; int cpu, len = 0, size, pc; @@ -1219,7 +1219,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; size = sizeof(*entry) + sizeof(u32) * len; - event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc); + event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); if (!event) goto out_unlock; entry = ring_buffer_event_data(event); @@ -1240,6 +1240,60 @@ out: return len; } +EXPORT_SYMBOL_GPL(trace_vbprintk); + +int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +{ + static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; + static char trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + int cpu, len = 0, size, pc; + struct print_entry *entry; + unsigned long irq_flags; + + if (tracing_disabled || tracing_selftest_running) + return 0; + + pc = preempt_count(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(atomic_read(&data->disabled))) + goto out; + + pause_graph_tracing(); + raw_local_irq_save(irq_flags); + __raw_spin_lock(&trace_buf_lock); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + len = min(len, TRACE_BUF_SIZE-1); + trace_buf[len] = 0; + + size = sizeof(*entry) + len + 1; + event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->depth = depth; + + memcpy(&entry->buf, trace_buf, len); + entry->buf[len] = 0; + ring_buffer_unlock_commit(tr->buffer, event); + + out_unlock: + __raw_spin_unlock(&trace_buf_lock); + raw_local_irq_restore(irq_flags); + unpause_graph_tracing(); + out: + preempt_enable_notrace(); + + return len; +} EXPORT_SYMBOL_GPL(trace_vprintk); enum trace_file_type { @@ -1628,6 +1682,22 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } +static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_bprintf(s, field->fmt, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1637,7 +1707,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_bprintf(s, field->fmt, field->buf); + ret = trace_seq_printf(s, "%s", field->buf); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -1702,6 +1772,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_BPRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return print_bprintk_msg_only(iter); + if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 336324d717f8..cede1ab49d07 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -20,6 +20,7 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_PRINT, + TRACE_BPRINT, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, @@ -117,7 +118,7 @@ struct userstack_entry { /* * trace_printk entry: */ -struct print_entry { +struct bprint_entry { struct trace_entry ent; unsigned long ip; int depth; @@ -125,6 +126,13 @@ struct print_entry { u32 buf[]; }; +struct print_entry { + struct trace_entry ent; + unsigned long ip; + int depth; + char buf[]; +}; + #define TRACE_OLD_SIZE 88 struct trace_field_cont { @@ -286,6 +294,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ @@ -570,6 +579,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern long ns2usecs(cycle_t nsec); extern int +trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args); +extern int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); extern unsigned long trace_flags; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 5cca4c978bde..d0907d746425 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, +TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) @@ -112,6 +112,15 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TP_RAW_FMT("%08lx (%d) fmt:%p %s") ); +TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, + TRACE_STRUCT( + TRACE_FIELD(unsigned long, ip, ip) + TRACE_FIELD(unsigned int, depth, depth) + TRACE_FIELD_ZERO_CHAR(buf) + ), + TP_RAW_FMT("%08lx (%d) fmt:%p %s") +); + TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8566c14b3e9a..4c388607ed67 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -684,7 +684,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct print_entry *trace, struct trace_seq *s, +print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter) { int i; @@ -781,8 +781,8 @@ print_graph_function(struct trace_iterator *iter) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter); } - case TRACE_PRINT: { - struct print_entry *field; + case TRACE_BPRINT: { + struct bprint_entry *field; trace_assign_type(field, entry); return print_graph_comment(field, s, entry, iter); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 23e346a734ca..f095916e477f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -254,6 +254,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); @@ -261,11 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_bprintf(s, print->fmt, print->buf); + ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 491832af9ba1..ea9d3b410c7a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -832,13 +832,13 @@ static struct trace_event trace_user_stack_event = { .binary = trace_special_bin, }; -/* TRACE_PRINT */ +/* TRACE_BPRINT */ static enum print_line_t -trace_print_print(struct trace_iterator *iter, int flags) +trace_bprint_print(struct trace_iterator *iter, int flags) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; - struct print_entry *field; + struct bprint_entry *field; trace_assign_type(field, entry); @@ -858,9 +858,10 @@ trace_print_print(struct trace_iterator *iter, int flags) } -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t +trace_bprint_raw(struct trace_iterator *iter, int flags) { - struct print_entry *field; + struct bprint_entry *field; struct trace_seq *s = &iter->seq; trace_assign_type(field, iter->ent); @@ -878,12 +879,55 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) } +static struct trace_event trace_bprint_event = { + .type = TRACE_BPRINT, + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; + +/* TRACE_PRINT */ +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags) +{ + struct print_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +{ + struct print_entry *field; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + static struct trace_event trace_print_event = { - .type = TRACE_PRINT, + .type = TRACE_PRINT, .trace = trace_print_print, .raw = trace_print_raw, }; + static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, @@ -891,6 +935,7 @@ static struct trace_event *events[] __initdata = { &trace_special_event, &trace_stack_event, &trace_user_stack_event, + &trace_bprint_event, &trace_print_event, NULL }; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a50aea22e929..f307a11e2332 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -99,7 +99,7 @@ struct notifier_block module_trace_bprintk_format_nb = { .notifier_call = module_trace_bprintk_format_notify, }; -int __trace_printk(unsigned long ip, const char *fmt, ...) +int __trace_bprintk(unsigned long ip, const char *fmt, ...) { int ret; va_list ap; @@ -111,13 +111,13 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); va_end(ap); return ret; } -EXPORT_SYMBOL_GPL(__trace_printk); +EXPORT_SYMBOL_GPL(__trace_bprintk); -int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) { if (unlikely(!fmt)) return 0; @@ -125,11 +125,34 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; + return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vbprintk); + +int __trace_printk(unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); } EXPORT_SYMBOL_GPL(__ftrace_vprintk); - static __init int init_trace_printk(void) { return register_module_notifier(&module_trace_bprintk_format_nb); -- cgit v1.2.3-59-g8ed1b From 828275574e0161bdddb5817d4bd76a0265ef0470 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 14:14:31 -0400 Subject: tracing: make bprint event use the proper event id The bprint record is using TRACE_PRINT when it should be TRACE_BPRINT. Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index d0907d746425..019915063fe6 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") ); -TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore, +TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) TRACE_FIELD(unsigned int, depth, depth) -- cgit v1.2.3-59-g8ed1b From e9fb2b6d5845e24f104713591286b6f39761c027 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 14:19:25 -0400 Subject: tracing: have event_trace_printk use static tracer Impact: speed up on event tracing The event_trace_printk is currently a wrapper function that calls trace_vprintk. Because it uses a variable for the fmt it misses out on the optimization of using the binary printk. This patch makes event_trace_printk into a macro wrapper to use the fmt as the same as the trace_printks. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 17 +++++++++++++++++ kernel/trace/trace_events.c | 10 ---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cede1ab49d07..35cfa7bbaf38 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -773,4 +773,21 @@ void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +extern const char *__start___trace_bprintk_fmt[]; +extern const char *__stop___trace_bprintk_fmt[]; + +#define event_trace_printk(ip, fmt, args...) \ +do { \ + __trace_printk_check_format(fmt, ##args); \ + tracing_record_cmdline(current); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(ip, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(ip, fmt, ##args); \ +} while (0) + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ca624df73591..238ea95a4115 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -24,16 +24,6 @@ static DEFINE_MUTEX(event_mutex); (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -void event_trace_printk(unsigned long ip, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - tracing_record_cmdline(current); - trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); - va_end(ap); -} - static void ftrace_clear_events(void) { struct ftrace_event_call *call = (void *)__start_ftrace_events; -- cgit v1.2.3-59-g8ed1b From 7975a2be16dd42df2cab80c80cb6ece382edb6ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 14:23:17 -0400 Subject: tracing: export trace formats to user space The binary printk saves a pointer to the format string in the ring buffer. On output, the format is processed. But if the user is reading the ring buffer through a binary interface, the pointer is meaningless. This patch creates a file called printk_formats that maps the pointers to the formats. # cat /debug/tracing/printk_formats 0xffffffff80713d40 : "irq_handler_entry: irq=%d handler=%s\n" 0xffffffff80713d48 : "lock_acquire: %s%s%s\n" 0xffffffff80713d50 : "lock_release: %s\n" Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 119 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f307a11e2332..486785214e3e 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -4,18 +4,19 @@ * Copyright (C) 2008 Lai Jiangshan * */ +#include +#include +#include #include #include #include +#include +#include +#include #include #include -#include #include -#include -#include #include -#include -#include #include "trace.h" @@ -153,6 +154,114 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + const char **fmt = m->private; + const char **next = fmt; + + (*pos)++; + + if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) + return NULL; + + next = fmt; + m->private = ++next; + + return fmt; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + return t_next(m, NULL, pos); +} + +static int t_show(struct seq_file *m, void *v) +{ + const char **fmt = v; + const char *str = *fmt; + int i; + + seq_printf(m, "0x%lx : \"", (unsigned long)fmt); + + /* + * Tabs and new lines need to be converted. + */ + for (i = 0; str[i]; i++) { + switch (str[i]) { + case '\n': + seq_puts(m, "\\n"); + break; + case '\t': + seq_puts(m, "\\t"); + break; + case '\\': + seq_puts(m, "\\"); + break; + case '"': + seq_puts(m, "\\\""); + break; + default: + seq_putc(m, str[i]); + } + } + seq_puts(m, "\"\n"); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations show_format_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static int +ftrace_formats_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &show_format_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = __start___trace_bprintk_fmt; + } + return ret; +} + +static const struct file_operations ftrace_formats_fops = { + .open = ftrace_formats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int init_trace_printk_function_export(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("printk_formats", 0444, d_tracer, + NULL, &ftrace_formats_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'printk_formats' entry\n"); + + return 0; +} + +fs_initcall(init_trace_printk_function_export); + static __init int init_trace_printk(void) { return register_module_notifier(&module_trace_bprintk_format_nb); -- cgit v1.2.3-59-g8ed1b From 2da03ecee6308ea174e8a02b92a3c4ec92e886c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 18:57:51 -0400 Subject: tracing: fix stack tracer header The stack tracer use to look like this: # cat /debug/tracing/stack_trace Depth Size Location (57 entries) ----- ---- -------- 0) 5088 16 mempool_alloc_slab+0x16/0x18 1) 5072 144 mempool_alloc+0x4d/0xfe 2) 4928 16 scsi_sg_alloc+0x48/0x4a [scsi_mod] Now it looks like this: # cat /debug/tracing/stack_trace Depth Size Location (57 entries) ----- ---- -------- 0) 5088 16 mempool_alloc_slab+0x16/0x18 1) 5072 144 mempool_alloc+0x4d/0xfe 2) 4928 16 scsi_sg_alloc+0x48/0x4a [scsi_mod] Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index d0871bc0aca5..4564fd94b0cd 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -251,9 +251,9 @@ static int t_show(struct seq_file *m, void *v) int size; if (v == SEQ_START_TOKEN) { - seq_printf(m, " Depth Size Location" + seq_printf(m, " Depth Size Location" " (%d entries)\n" - " ----- ---- --------\n", + " ----- ---- --------\n", max_stack_trace.nr_entries); return 0; } -- cgit v1.2.3-59-g8ed1b From e447e1df2e568cd43d1918963c9f09fae85aea57 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 19:42:29 -0400 Subject: tracing: explain why stack tracer is empty If the stack tracing is disabled (by default) the stack_trace file will only contain the header: # cat /debug/tracing/stack_trace Depth Size Location (0 entries) ----- ---- -------- This can be frustrating to a developer that does not realize that the stack tracer is disabled. This patch adds the following text: # cat /debug/tracing/stack_trace Depth Size Location (0 entries) ----- ---- -------- # # Stack tracer disabled # # To enable the stack tracer, either add 'stacktrace' to the # kernel command line # or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled' # Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4564fd94b0cd..91ccbf396c9a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -245,6 +245,17 @@ static int trace_lookup_stack(struct seq_file *m, long i) #endif } +static void print_disabled(struct seq_file *m) +{ + seq_puts(m, "#\n" + "# Stack tracer disabled\n" + "#\n" + "# To enable the stack tracer, either add 'stacktrace' to the\n" + "# kernel command line\n" + "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n" + "#\n"); +} + static int t_show(struct seq_file *m, void *v) { long i; @@ -255,6 +266,10 @@ static int t_show(struct seq_file *m, void *v) " (%d entries)\n" " ----- ---- --------\n", max_stack_trace.nr_entries); + + if (!stack_tracer_enabled && !max_stack_size) + print_disabled(m); + return 0; } -- cgit v1.2.3-59-g8ed1b From 5d592b44b29a1d73e13d5c9e3426eed843bdc359 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 12 Mar 2009 14:33:36 -0400 Subject: tracing: tracepoints for softirq entry/exit - add softirq-to-name array Create a 'softirq_to_name' array, which is indexed by softirq #, so that we can easily convert between the softirq index # and its name, in order to get more meaningful output messages. LKML-Reference: <20090312183336.GB3352@redhat.com> Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt --- include/linux/interrupt.h | 5 +++++ kernel/softirq.c | 9 ++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 472f11765f60..9b7e9d743476 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -258,6 +258,11 @@ enum NR_SOFTIRQS }; +/* map softirq index to softirq name. update 'softirq_to_name' in + * kernel/softirq.c when adding a new softirq. + */ +extern char *softirq_to_name[NR_SOFTIRQS]; + /* softirq mask and active fields moved to irq_cpustat_t in * asm/hardirq.h to get better cache usage. KAO */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 7571bcb71be4..9f90fdc039f4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -53,6 +53,12 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +char *softirq_to_name[NR_SOFTIRQS] = { + "HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ", + "BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ", + "RCU_SOFTIRQ" +}; + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency @@ -209,9 +215,10 @@ restart: h->action(h); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %p" + printk(KERN_ERR "huh, entered softirq %td %s %p" "with preempt_count %08x," " exited with %08x?\n", h - softirq_vec, + softirq_to_name[h - softirq_vec], h->action, prev_count, preempt_count()); preempt_count() = prev_count; } -- cgit v1.2.3-59-g8ed1b From 39842323ceb368d2ea36ab7696aedbe296e13b61 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 12 Mar 2009 14:36:03 -0400 Subject: tracing: tracepoints for softirq entry/exit - tracepoints Introduce softirq entry/exit tracepoints. These are useful for augmenting existing tracers, and to figure out softirq frequencies and timings. [ s/irq_softirq_/softirq_/ for trace point names and Fixed printf format in TRACE_FORMAT macro - Steven Rostedt ] LKML-Reference: <20090312183603.GC3352@redhat.com> Signed-off-by: Jason Baron Signed-off-by: Steven Rostedt --- include/trace/irq_event_types.h | 12 ++++++++++++ kernel/softirq.c | 7 ++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h index 214bb928fe9e..85964ebd47ec 100644 --- a/include/trace/irq_event_types.h +++ b/include/trace/irq_event_types.h @@ -40,4 +40,16 @@ TRACE_EVENT(irq_handler_exit, __entry->irq, __entry->ret ? "handled" : "unhandled") ); +TRACE_FORMAT(softirq_entry, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); + +TRACE_FORMAT(softirq_exit, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); + #undef TRACE_SYSTEM diff --git a/kernel/softirq.c b/kernel/softirq.c index 9f90fdc039f4..a5e81231ca7a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* @@ -186,6 +187,9 @@ EXPORT_SYMBOL(local_bh_enable_ip); */ #define MAX_SOFTIRQ_RESTART 10 +DEFINE_TRACE(softirq_entry); +DEFINE_TRACE(softirq_exit); + asmlinkage void __do_softirq(void) { struct softirq_action *h; @@ -212,8 +216,9 @@ restart: if (pending & 1) { int prev_count = preempt_count(); + trace_softirq_entry(h, softirq_vec); h->action(h); - + trace_softirq_exit(h, softirq_vec); if (unlikely(prev_count != preempt_count())) { printk(KERN_ERR "huh, entered softirq %td %s %p" "with preempt_count %08x," -- cgit v1.2.3-59-g8ed1b From 889a6c367283709a80dad9413488472596a1a1d2 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 13 Mar 2009 09:03:04 +0900 Subject: tracing: Don't use tracing_record_cmdline() in workqueue tracer fix commit c3ffc7a40b7e94b094efe1c8ab4e24370a782b65 "Don't use tracing_record_cmdline() in workqueue tracer" has a race window. find_task_by_vpid() requires task_list_lock(). LKML-Reference: <20090313090042.43CD.A69D9226@jp.fujitsu.com> Signed-off-by: KOSAKI Motohiro Signed-off-by: Steven Rostedt --- kernel/trace/trace_workqueue.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index fb5ccac8bbc0..9ab035b58cf1 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -193,12 +193,20 @@ static int workqueue_stat_show(struct seq_file *s, void *p) struct cpu_workqueue_stats *cws = p; unsigned long flags; int cpu = cws->cpu; - struct task_struct *tsk = find_task_by_vpid(cws->pid); - - seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, - atomic_read(&cws->inserted), - cws->executed, - tsk ? tsk->comm : "<...>"); + struct pid *pid; + struct task_struct *tsk; + + pid = find_get_pid(cws->pid); + if (pid) { + tsk = get_pid_task(pid, PIDTYPE_PID); + if (tsk) { + seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, + atomic_read(&cws->inserted), cws->executed, + tsk->comm); + put_task_struct(tsk); + } + put_pid(pid); + } spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); if (&cws->list == workqueue_cpu_stat(cpu)->list.next) -- cgit v1.2.3-59-g8ed1b From f28e55765e40450c127e44d00ae65d0cd1a4efec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 22:00:19 -0400 Subject: tracing: show event name in trace for TRACE_EVENT created events Unlike TRACE_FORMAT() macros, the TRACE_EVENT() macros do not show the event name in the trace file. Knowing the event type in the trace output is very useful. Instead of: task swapper:0 [140] ==> ntpd:3308 [120] We now have: sched_switch: task swapper:0 [140] ==> ntpd:3308 [120] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_stage_2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index ca347afd6aa0..5117c43f5c67 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -57,7 +57,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ \ field = (typeof(field))entry; \ \ - ret = trace_seq_printf(s, print); \ + ret = trace_seq_printf(s, #call ": " print); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ -- cgit v1.2.3-59-g8ed1b From 5cc985488845ec7227a2c5cfd2fd62cf57fb411a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Mar 2009 22:24:17 -0400 Subject: ring-buffer: document reader page design In a private email conversation I explained how the ring buffer page worked by using silly ASCII art. Ingo suggested that I add that to the comments of the code. Here it is. Requested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2c36be9fac2e..58128ad2fde0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -21,6 +21,74 @@ #include "trace.h" +/* + * The ring buffer is made up of a list of pages. A separate list of pages is + * allocated for each CPU. A writer may only write to a buffer that is + * associated with the CPU it is currently executing on. A reader may read + * from any per cpu buffer. + * + * The reader is special. For each per cpu buffer, the reader has its own + * reader page. When a reader has read the entire reader page, this reader + * page is swapped with another page in the ring buffer. + * + * Now, as long as the writer is off the reader page, the reader can do what + * ever it wants with that page. The writer will never write to that page + * again (as long as it is out of the ring buffer). + * + * Here's some silly ASCII art. + * + * +------+ + * |reader| RING BUFFER + * |page | + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | |-->| |-->| | + * | +---+ +---+ +---+ + * | | + * | | + * +------------------------------+ + * + * + * +------+ + * |buffer| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | | | |-->| | + * | New +---+ +---+ +---+ + * | Reader------^ | + * | page | + * +------------------------------+ + * + * + * After we make this swap, the reader can hand this page off to the splice + * code and be done with it. It can even allocate a new page if it needs to + * and swap that into the ring buffer. + * + * We will be using cmpxchg soon to make all this lockless. + * + */ + /* * A fast way to enable or disable all ring buffers is to * call tracing_on or tracing_off. Turning off the ring buffers -- cgit v1.2.3-59-g8ed1b -- cgit v1.2.3-59-g8ed1b From eb1871f34358024acfa3523ef375ef14b7527173 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Mar 2009 00:00:58 -0400 Subject: tracing: left align location header in stack_trace Ingo Molnar suggested, instead of: Depth Size Location (27 entries) ----- ---- -------- 0) 2880 48 lock_timer_base+0x2b/0x4f 1) 2832 80 __mod_timer+0x33/0xe0 2) 2752 16 __ide_set_handler+0x63/0x65 To have it be: Depth Size Location (27 entries) ----- ---- -------- 0) 2880 48 lock_timer_base+0x2b/0x4f 1) 2832 80 __mod_timer+0x33/0xe0 2) 2752 16 __ide_set_handler+0x63/0x65 Requested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 91ccbf396c9a..c750f65f9661 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -262,9 +262,9 @@ static int t_show(struct seq_file *m, void *v) int size; if (v == SEQ_START_TOKEN) { - seq_printf(m, " Depth Size Location" + seq_printf(m, " Depth Size Location" " (%d entries)\n" - " ----- ---- --------\n", + " ----- ---- --------\n", max_stack_trace.nr_entries); if (!stack_tracer_enabled && !max_stack_size) -- cgit v1.2.3-59-g8ed1b From bdc067582b8b71c7771bab076bbc51569c594fb4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Mar 2009 00:12:52 -0400 Subject: tracing: add comment for use of double __builtin_consant_p Impact: documentation The use of the double __builtin_contant_p checks in the event_trace_printk can be confusing to developers and reviewers. This patch adds a comment to explain why it is there. Requested-by: KOSAKI Motohiro LKML-Reference: <20090313122235.43EB.A69D9226@jp.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 35cfa7bbaf38..67595b8f0f15 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -776,6 +776,11 @@ extern struct ftrace_event_call __stop_ftrace_events[]; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement optimizing out. + */ #define event_trace_printk(ip, fmt, args...) \ do { \ __trace_printk_check_format(fmt, ##args); \ -- cgit v1.2.3-59-g8ed1b From c69fc56de1df5769f2ec69c915c7ad5afe63804c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:46 +1030 Subject: cpumask: use topology_core_cpumask/topology_thread_cpumask instead of cpu_core_map/cpu_sibling_map Impact: cleanup This is presumably what those definitions are for, and while all archs define cpu_core_map/cpu_sibling map, that's changing (eg. x86 wants to change it to a pointer). Signed-off-by: Rusty Russell --- block/blk.h | 2 +- kernel/sched.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk.h b/block/blk.h index 0dce92c37496..3ee94358b43d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -102,7 +102,7 @@ static inline int blk_cpu_to_group(int cpu) const struct cpumask *mask = cpu_coregroup_mask(cpu); return cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - return first_cpu(per_cpu(cpu_sibling_map, cpu)); + return cpumask_first(topology_thread_cpumask(cpu)); #else return cpu; #endif diff --git a/kernel/sched.c b/kernel/sched.c index 0a76d0b6f215..5dabd80c3c15 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7249,7 +7249,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map, { int group; - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); if (sg) *sg = &per_cpu(sched_group_core, group).sg; @@ -7278,7 +7278,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); #else group = cpu; @@ -7621,7 +7621,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), - &per_cpu(cpu_sibling_map, i), cpu_map); + topology_thread_cpumask(i), cpu_map); sd->parent = p; p->child = sd; cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); @@ -7632,7 +7632,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { cpumask_and(this_sibling_map, - &per_cpu(cpu_sibling_map, i), cpu_map); + topology_thread_cpumask(i), cpu_map); if (i != cpumask_first(this_sibling_map)) continue; -- cgit v1.2.3-59-g8ed1b From a70f730282019f487aa33a84e5ac9a5e89c5abd0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:46 +1030 Subject: cpumask: replace node_to_cpumask with cpumask_of_node. Impact: cleanup node_to_cpumask (and the blecherous node_to_cpumask_ptr which contained a declaration) are replaced now everyone implements cpumask_of_node. Signed-off-by: Rusty Russell --- drivers/base/node.c | 2 +- drivers/pci/pci-driver.c | 3 +-- include/linux/topology.h | 6 +----- mm/page_alloc.c | 6 +++--- mm/quicklist.c | 2 +- mm/slab.c | 2 +- mm/vmscan.c | 6 ++++-- net/sunrpc/svc.c | 3 +-- 8 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index f8f578a71b25..40b809742a1c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -24,7 +24,7 @@ static struct sysdev_class node_class = { static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf) { struct node *node_dev = to_node(dev); - node_to_cpumask_ptr(mask, node_dev->sysdev.id); + const struct cpumask *mask = cpumask_of_node(node_dev->sysdev.id); int len; /* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */ diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 93eac1423585..b522f883d674 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -212,10 +212,9 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, node = dev_to_node(&dev->dev); if (node >= 0) { int cpu; - node_to_cpumask_ptr(nodecpumask, node); get_online_cpus(); - cpu = cpumask_any_and(nodecpumask, cpu_online_mask); + cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else diff --git a/include/linux/topology.h b/include/linux/topology.h index a16b9e06f2e5..16b7d6896ce9 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -38,11 +38,7 @@ #endif #ifndef nr_cpus_node -#define nr_cpus_node(node) \ - ({ \ - node_to_cpumask_ptr(__tmp__, node); \ - cpus_weight(*__tmp__); \ - }) +#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif #define for_each_node_with_cpus(node) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5c44ed49ca93..a92b0975b9a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2134,7 +2134,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) int n, val; int min_val = INT_MAX; int best_node = -1; - node_to_cpumask_ptr(tmp, 0); + const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { @@ -2155,8 +2155,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) val += (n < node); /* Give preference to headless and unused nodes */ - node_to_cpumask_ptr_next(tmp, n); - if (!cpus_empty(*tmp)) + tmp = cpumask_of_node(n); + if (!cpumask_empty(tmp)) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ diff --git a/mm/quicklist.c b/mm/quicklist.c index 8dbb6805ef35..e66d07d1b4ff 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -29,7 +29,7 @@ static unsigned long max_pages(unsigned long min_pages) int node = numa_node_id(); struct zone *zones = NODE_DATA(node)->node_zones; int num_cpus_on_node; - node_to_cpumask_ptr(cpumask_on_node, node); + const struct cpumask *cpumask_on_node = cpumask_of_node(node); node_free_pages = #ifdef CONFIG_ZONE_DMA diff --git a/mm/slab.c b/mm/slab.c index 4d00855629c4..2daaca0b4541 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1160,7 +1160,7 @@ static void __cpuinit cpuup_canceled(long cpu) struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; int node = cpu_to_node(cpu); - node_to_cpumask_ptr(mask, node); + const struct cpumask *mask = cpumask_of_node(node); list_for_each_entry(cachep, &cache_chain, next) { struct array_cache *nc; diff --git a/mm/vmscan.c b/mm/vmscan.c index 6177e3bcd66b..cc6135586b44 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1963,7 +1963,7 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; - node_to_cpumask_ptr(cpumask, pgdat->node_id); + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); @@ -2198,7 +2198,9 @@ static int __devinit cpu_callback(struct notifier_block *nfb, if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { for_each_node_state(nid, N_HIGH_MEMORY) { pg_data_t *pgdat = NODE_DATA(nid); - node_to_cpumask_ptr(mask, pgdat->node_id); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index c51fed4d1af1..3bdd5bffaca8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -317,8 +317,7 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) } case SVC_POOL_PERNODE: { - node_to_cpumask_ptr(nodecpumask, node); - set_cpus_allowed_ptr(task, nodecpumask); + set_cpus_allowed_ptr(task, cpumask_of_node(node)); break; } } -- cgit v1.2.3-59-g8ed1b From 101aaca1f32d9923331ddc63a7a72b3a7f934c02 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:47 +1030 Subject: cpumask: remove dangerous CPU_MASK_ALL_PTR, &CPU_MASK_ALL.: x86 Impact: cleanup (Thanks to Al Viro for reminding me of this, via Ingo) CPU_MASK_ALL is the (deprecated) "all bits set" cpumask, defined as so: #define CPU_MASK_ALL (cpumask_t) { { ... } } Taking the address of such a temporary is questionable at best, unfortunately 321a8e9d (cpumask: add CPU_MASK_ALL_PTR macro) added CPU_MASK_ALL_PTR: #define CPU_MASK_ALL_PTR (&CPU_MASK_ALL) Which formalizes this practice. One day gcc could bite us over this usage (though we seem to have gotten away with it so far). So replace everywhere which used &CPU_MASK_ALL or CPU_MASK_ALL_PTR with the modern "cpu_all_mask" (a real const struct cpumask *), and remove CPU_MASK_ALL_PTR altogether. Signed-off-by: Rusty Russell Acked-by: Ingo Molnar Reported-by: Al Viro Cc: Mike Travis --- arch/x86/kernel/apic/es7000_32.c | 2 +- arch/x86/kernel/apic/numaq_32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 19588f2770ee..1322f5409e20 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -457,7 +457,7 @@ static int es7000_apic_id_registered(void) static const cpumask_t *target_cpus_cluster(void) { - return &CPU_MASK_ALL; + return cpu_all_mask; } static const cpumask_t *es7000_target_cpus(void) diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ba2fc6465534..e4ce98af8c74 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -336,7 +336,7 @@ static inline void numaq_smp_callin_clear_local_apic(void) static inline const cpumask_t *numaq_target_cpus(void) { - return &CPU_MASK_ALL; + return cpu_all_mask; } static inline unsigned long -- cgit v1.2.3-59-g8ed1b From cb3d560f36c1e4aa3c26a1d79e9b6e62ab69896c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:47 +1030 Subject: cpumask: remove the now-obsoleted pcibus_to_cpumask(): x86 Impact: reduce stack usage for large NR_CPUS cpumask_of_pcibus() is the new version. Signed-off-by: Rusty Russell --- arch/x86/include/asm/pci.h | 5 ----- arch/x86/include/asm/topology.h | 1 - 2 files changed, 6 deletions(-) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index a977de23cb4d..93c8dc318111 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -109,11 +109,6 @@ static inline int __pcibus_to_node(const struct pci_bus *bus) return sd->node; } -static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus) -{ - return node_to_cpumask(__pcibus_to_node(bus)); -} - static inline const struct cpumask * cpumask_of_pcibus(const struct pci_bus *bus) { diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 77cfb2cfb386..d772facb263e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -143,7 +143,6 @@ extern void setup_node_to_cpumask_map(void); #define parent_node(node) (node) #define pcibus_to_node(bus) __pcibus_to_node(bus) -#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus) #ifdef CONFIG_X86_32 extern unsigned long node_start_pfn[]; -- cgit v1.2.3-59-g8ed1b From 23c5c9c66263311de1295b42382e5bc1e7c36c47 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:48 +1030 Subject: cpumask: remove cpu_coregroup_map: x86 Impact: cleanup cpu_coregroup_mask is the New Hotness. Signed-off-by: Rusty Russell --- arch/x86/include/asm/topology.h | 1 - arch/x86/kernel/smpboot.c | 5 ----- 2 files changed, 6 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index d772facb263e..6b954fbd7872 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -244,7 +244,6 @@ static inline int node_to_first_cpu(int node) } #endif -extern cpumask_t cpu_coregroup_map(int cpu); extern const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef ENABLE_TOPO_DEFINES diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ef7d10170c30..f534257d4b46 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -428,11 +428,6 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &c->llc_shared_map; } -cpumask_t cpu_coregroup_map(int cpu) -{ - return *cpu_coregroup_mask(cpu); -} - static void impress_friends(void) { int cpu; -- cgit v1.2.3-59-g8ed1b From d3d2e7f24384cccedd29a0582ad4b014ac646abc Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:48 +1030 Subject: cpumask: remove obsolete topology_core_siblings and topology_thread_siblings: x86 Impact: cleanup There were replaced by topology_core_cpumask and topology_thread_cpumask. Signed-off-by: Rusty Russell --- arch/x86/include/asm/topology.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 6b954fbd7872..f7c20d031422 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -249,8 +249,6 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef ENABLE_TOPO_DEFINES #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) -#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu)) -#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu)) #define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu)) #define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) -- cgit v1.2.3-59-g8ed1b From bc9b83dd1f66402b870301c3c7117b9c1484abb4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:49 +1030 Subject: cpumask: convert c1e_mask in arch/x86/kernel/process.c to cpumask_var_t. Impact: reduce kernel size when CONFIG_CPUMASK_OFFSTACK=y Simple conversion. Signed-off-by: Rusty Russell --- arch/x86/kernel/process.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6afa5232dbb7..cad5431951aa 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -474,12 +474,12 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) return 1; } -static cpumask_t c1e_mask = CPU_MASK_NONE; +static cpumask_var_t c1e_mask; static int c1e_detected; void c1e_remove_cpu(int cpu) { - cpu_clear(cpu, c1e_mask); + cpumask_clear_cpu(cpu, c1e_mask); } /* @@ -508,8 +508,8 @@ static void c1e_idle(void) if (c1e_detected) { int cpu = smp_processor_id(); - if (!cpu_isset(cpu, c1e_mask)) { - cpu_set(cpu, c1e_mask); + if (!cpumask_test_cpu(cpu, c1e_mask)) { + cpumask_set_cpu(cpu, c1e_mask); /* * Force broadcast so ACPI can not interfere. Needs * to run with interrupts enabled as it uses @@ -556,6 +556,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) pm_idle = mwait_idle; } else if (check_c1e_idle(c)) { printk(KERN_INFO "using C1E aware idle routine\n"); + alloc_cpumask_var(&c1e_mask, GFP_KERNEL); + cpumask_clear(c1e_mask); pm_idle = c1e_idle; } else pm_idle = default_idle; -- cgit v1.2.3-59-g8ed1b From fcef8576d8a64fc603e719c97d423f9f6d4e0e8b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:49 +1030 Subject: cpumask: convert arch/x86/kernel/nmi.c's backtrace_mask to a cpumask_var_t Impact: cleanup, reduce memory usage for CONFIG_CPUMASK_OFFSTACK=y I *think* every path calls check_nmi_watchdog before using the watchdog, so that's the right place for the initialization. If that's wrong, we'll get a nice NULL-deref with CONFIG_CPUMASK_OFFSTACK=y, and have uncovered another bug. Signed-off-by: Rusty Russell --- arch/x86/kernel/apic/nmi.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index bdfad80c3cf1..d6bd62407152 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,7 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_t backtrace_mask = CPU_MASK_NONE; +static cpumask_var_t backtrace_mask; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -138,6 +138,7 @@ int __init check_nmi_watchdog(void) if (!prev_nmi_count) goto error; + alloc_cpumask_var(&backtrace_mask, GFP_KERNEL); printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP @@ -413,14 +414,14 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) touched = 1; } - if (cpu_isset(cpu, backtrace_mask)) { + if (cpumask_test_cpu(cpu, backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); dump_stack(); spin_unlock(&lock); - cpu_clear(cpu, backtrace_mask); + cpumask_clear_cpu(cpu, backtrace_mask); } /* Could check oops_in_progress here too, but it's safer not to */ @@ -554,10 +555,10 @@ void __trigger_all_cpu_backtrace(void) { int i; - backtrace_mask = cpu_online_map; + cpumask_copy(backtrace_mask, cpu_online_mask); /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpus_empty(backtrace_mask)) + if (cpumask_empty(backtrace_mask)) break; mdelay(1); } -- cgit v1.2.3-59-g8ed1b From 7ad728f98162cb1af06a85b2a5fc422dddd4fb78 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:50 +1030 Subject: cpumask: x86: convert cpu_sibling_map/cpu_core_map to cpumask_var_t Impact: reduce per-cpu size for CONFIG_CPUMASK_OFFSTACK=y In most places it's cleaner to use the accessors cpu_sibling_mask() and cpu_core_mask() wrappers which already exist. I couldn't avoid cleaning up the access in oprofile, either. Signed-off-by: Rusty Russell --- arch/x86/include/asm/smp.h | 8 ++++---- arch/x86/include/asm/topology.h | 6 +++--- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 13 ++++++++----- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 6 +++--- arch/x86/kernel/cpu/proc.c | 2 +- arch/x86/kernel/smpboot.c | 12 ++++++++++-- arch/x86/oprofile/op_model_p4.c | 2 +- 9 files changed, 32 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 47d0e21f2b9e..cfb10f1667fe 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -21,19 +21,19 @@ extern int smp_num_siblings; extern unsigned int num_processors; -DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); -DECLARE_PER_CPU(cpumask_t, cpu_core_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU(u16, cpu_llc_id); DECLARE_PER_CPU(int, cpu_number); static inline struct cpumask *cpu_sibling_mask(int cpu) { - return &per_cpu(cpu_sibling_map, cpu); + return per_cpu(cpu_sibling_map, cpu); } static inline struct cpumask *cpu_core_mask(int cpu) { - return &per_cpu(cpu_core_map, cpu); + return per_cpu(cpu_core_map, cpu); } DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index f7c20d031422..fa4aa42e976d 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -249,8 +249,8 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #ifdef ENABLE_TOPO_DEFINES #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) -#define topology_core_cpumask(cpu) (&per_cpu(cpu_core_map, cpu)) -#define topology_thread_cpumask(cpu) (&per_cpu(cpu_sibling_map, cpu)) +#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) +#define topology_thread_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) /* indicates that pointers to the topology cpumask_t maps are valid */ #define arch_provides_topology_pointers yes @@ -264,7 +264,7 @@ struct pci_bus; void set_pci_bus_resources_arch_default(struct pci_bus *b); #ifdef CONFIG_SMP -#define mc_capable() (cpus_weight(per_cpu(cpu_core_map, 0)) != nr_cpu_ids) +#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids) #define smt_capable() (smp_num_siblings > 1) #endif diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 3178c3acd97e..d8341d17c189 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -203,7 +203,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) unsigned int i; #ifdef CONFIG_SMP - cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu)); + cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); #endif /* Errata workaround */ diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 6428aa17b40e..e8fd76f98883 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -56,7 +56,10 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); static int cpu_family = CPU_OPTERON; #ifndef CONFIG_SMP -DEFINE_PER_CPU(cpumask_t, cpu_core_map); +static inline const struct cpumask *cpu_core_mask(int cpu) +{ + return cpumask_of(0); +} #endif /* Return a frequency in MHz, given an input fid */ @@ -654,7 +657,7 @@ static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); data->powernow_table = powernow_table; - if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) + if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) print_basics(data); for (j = 0; j < data->numps; j++) @@ -808,7 +811,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) /* fill in data */ data->numps = data->acpi_data.state_count; - if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) + if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) print_basics(data); powernow_k8_acpi_pst_values(data, 0); @@ -1224,7 +1227,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) if (cpu_family == CPU_HW_PSTATE) cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); else - cpumask_copy(pol->cpus, &per_cpu(cpu_core_map, pol->cpu)); + cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); data->available_cores = pol->cpus; if (cpu_family == CPU_HW_PSTATE) @@ -1286,7 +1289,7 @@ static unsigned int powernowk8_get (unsigned int cpu) unsigned int khz = 0; unsigned int first; - first = first_cpu(per_cpu(cpu_core_map, cpu)); + first = cpumask_first(cpu_core_mask(cpu)); data = per_cpu(powernow_data, first); if (!data) diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index dedc1e98f168..1f0ec83d343b 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -322,7 +322,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) /* only run on CPU to be set, or on its sibling */ #ifdef CONFIG_SMP - cpumask_copy(policy->cpus, &per_cpu(cpu_sibling_map, policy->cpu)); + cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); #endif cpus_allowed = current->cpus_allowed; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index c5a32f92d07e..1f429ee3477d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -477,7 +477,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) #ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(&per_cpu(cpu_core_map, cpu)); + i = cpumask_first(cpu_core_mask(cpu)); /* first core not up yet */ if (cpu_data(i).cpu_core_id) @@ -497,7 +497,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu)); + cpumask_copy(b->cpus, cpu_core_mask(cpu)); per_cpu(threshold_banks, cpu)[bank] = b; goto out; } @@ -521,7 +521,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) #ifndef CONFIG_SMP cpumask_setall(b->cpus); #else - cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu)); + cpumask_copy(b->cpus, cpu_core_mask(cpu)); #endif per_cpu(threshold_banks, cpu)[bank] = b; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index d67e0e48bc2d..4dd610e226e0 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -14,7 +14,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, if (c->x86_max_cores * smp_num_siblings > 1) { seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); seq_printf(m, "siblings\t: %d\n", - cpus_weight(per_cpu(cpu_core_map, cpu))); + cpumask_weight(cpu_sibling_mask(cpu))); seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); seq_printf(m, "apicid\t\t: %d\n", c->apicid); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f534257d4b46..7f051c170add 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -101,11 +101,11 @@ EXPORT_SYMBOL(smp_num_siblings); DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; /* representing HT siblings of each logical CPU */ -DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); /* representing HT and core siblings of each logical CPU */ -DEFINE_PER_CPU(cpumask_t, cpu_core_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); /* Per CPU bogomips and other parameters */ @@ -1026,6 +1026,8 @@ static void __init smp_cpu_index_default(void) */ void __init native_smp_prepare_cpus(unsigned int max_cpus) { + unsigned int i; + preempt_disable(); smp_cpu_index_default(); current_cpu_data = boot_cpu_data; @@ -1039,6 +1041,12 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) boot_cpu_logical_apicid = logical_smp_processor_id(); #endif current_thread_info()->cpu = 0; /* needed? */ + for_each_possible_cpu(i) { + alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + cpumask_clear(per_cpu(cpu_core_map, i)); + cpumask_clear(per_cpu(cpu_sibling_map, i)); + } set_cpu_sibling_map(0); enable_IR_x2apic(); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 4c4a51c90bc2..819b131fd752 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -380,7 +380,7 @@ static unsigned int get_stagger(void) { #ifdef CONFIG_SMP int cpu = smp_processor_id(); - return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); + return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map)); #endif return 0; } -- cgit v1.2.3-59-g8ed1b From 996867d0965775dfa62447d0bddb5dc6818a7892 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:51 +1030 Subject: cpumask: convert arch/x86/kernel/cpu/mcheck/mce_64.c Impact: reduce kernel memory usage when CONFIG_CPUMASK_OFFSTACK=y Simple conversion of mce_device_initialized to cpumask_var_t. We don't check the alloc_cpumask_var() return since it's boot-time only, and the misc_register() in that same function isn't checked. Signed-off-by: Rusty Russell --- arch/x86/kernel/cpu/mcheck/mce_64.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index ca14604611ec..863f89568b1a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -990,7 +990,7 @@ static struct sysdev_attribute *mce_attributes[] = { NULL }; -static cpumask_t mce_device_initialized = CPU_MASK_NONE; +static cpumask_var_t mce_device_initialized; /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ static __cpuinit int mce_create_device(unsigned int cpu) @@ -1021,7 +1021,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (err) goto error2; } - cpu_set(cpu, mce_device_initialized); + cpumask_set_cpu(cpu, mce_device_initialized); return 0; error2: @@ -1043,7 +1043,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) { int i; - if (!cpu_isset(cpu, mce_device_initialized)) + if (!cpumask_test_cpu(cpu, mce_device_initialized)) return; for (i = 0; mce_attributes[i]; i++) @@ -1053,7 +1053,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(device_mce, cpu), &bank_attrs[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); - cpu_clear(cpu, mce_device_initialized); + cpumask_clear_cpu(cpu, mce_device_initialized); } /* Make sure there are no machine checks on offlined CPUs. */ @@ -1162,6 +1162,8 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; + alloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + err = mce_init_banks(); if (err) return err; -- cgit v1.2.3-59-g8ed1b From b643decad6c80b6886a01a8c2229be6b7951ff7b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:51 +1030 Subject: x86: arch_send_call_function_ipi_mask Impact: implement new API We define arch_send_call_function_ipi_mask and generic kernel/smp.c code creates arch_send_call_function_ipi() as a wrapper. Signed-off-by: Rusty Russell --- arch/x86/include/asm/smp.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index cfb10f1667fe..19e0d88b966d 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -121,9 +121,10 @@ static inline void arch_send_call_function_single_ipi(int cpu) smp_ops.send_call_func_single_ipi(cpu); } -static inline void arch_send_call_function_ipi(cpumask_t mask) +#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask +static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) { - smp_ops.send_call_func_ipi(&mask); + smp_ops.send_call_func_ipi(mask); } void cpu_disable_common(void); -- cgit v1.2.3-59-g8ed1b From b9c4398ed43a7ed023e091610c23ba7412aec2a8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:52 +1030 Subject: cpumask: remove x86's node_to_cpumask now everyone uses cpumask_of_node Impact: cleanup Signed-off-by: Rusty Russell --- arch/x86/include/asm/topology.h | 41 ----------------------------------------- arch/x86/mm/numa_64.c | 26 -------------------------- 2 files changed, 67 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index fa4aa42e976d..216a960d4f10 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -57,17 +57,6 @@ static inline int cpu_to_node(int cpu) } #define early_cpu_to_node(cpu) cpu_to_node(cpu) -/* Returns a bitmask of CPUs on Node 'node'. - * - * Side note: this function creates the returned cpumask on the stack - * so with a high NR_CPUS count, excessive stack space is used. The - * cpumask_of_node function should be used whenever possible. - */ -static inline cpumask_t node_to_cpumask(int node) -{ - return node_to_cpumask_map[node]; -} - /* Returns a bitmask of CPUs on Node 'node'. */ static inline const struct cpumask *cpumask_of_node(int node) { @@ -92,7 +81,6 @@ DECLARE_PER_CPU(int, node_number); extern int cpu_to_node(int cpu); extern int early_cpu_to_node(int cpu); extern const cpumask_t *cpumask_of_node(int node); -extern cpumask_t node_to_cpumask(int node); #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ @@ -114,26 +102,10 @@ static inline const cpumask_t *cpumask_of_node(int node) return &node_to_cpumask_map[node]; } -/* Returns a bitmask of CPUs on Node 'node'. */ -static inline cpumask_t node_to_cpumask(int node) -{ - return node_to_cpumask_map[node]; -} - #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ extern void setup_node_to_cpumask_map(void); -/* - * Replace default node_to_cpumask_ptr with optimized version - * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" - */ -#define node_to_cpumask_ptr(v, node) \ - const cpumask_t *v = cpumask_of_node(node) - -#define node_to_cpumask_ptr_next(v, node) \ - v = cpumask_of_node(node) - #endif /* CONFIG_X86_64 */ /* @@ -212,10 +184,6 @@ static inline const cpumask_t *cpumask_of_node(int node) { return &cpu_online_map; } -static inline cpumask_t node_to_cpumask(int node) -{ - return cpu_online_map; -} static inline int node_to_first_cpu(int node) { return first_cpu(cpu_online_map); @@ -223,15 +191,6 @@ static inline int node_to_first_cpu(int node) static inline void setup_node_to_cpumask_map(void) { } -/* - * Replace default node_to_cpumask_ptr with optimized version - * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)" - */ -#define node_to_cpumask_ptr(v, node) \ - const cpumask_t *v = cpumask_of_node(node) - -#define node_to_cpumask_ptr_next(v, node) \ - v = cpumask_of_node(node) #endif #include diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 64c9cf043cdd..48bf396b6e79 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -826,32 +826,6 @@ const cpumask_t *cpumask_of_node(int node) } EXPORT_SYMBOL(cpumask_of_node); -/* - * Returns a bitmask of CPUs on Node 'node'. - * - * Side note: this function creates the returned cpumask on the stack - * so with a high NR_CPUS count, excessive stack space is used. The - * node_to_cpumask_ptr function should be used whenever possible. - */ -cpumask_t node_to_cpumask(int node) -{ - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "node_to_cpumask(%d): no node_to_cpumask_map!\n", node); - dump_stack(); - return cpu_online_map; - } - if (node >= nr_node_ids) { - printk(KERN_WARNING - "node_to_cpumask(%d): node > nr_node_ids(%d)\n", - node, nr_node_ids); - dump_stack(); - return cpu_mask_none; - } - return node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(node_to_cpumask); - /* * --------- end of debug versions of the numa functions --------- */ -- cgit v1.2.3-59-g8ed1b From 71ee73e72228775a076a502b3c92028fa59e2889 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:52 +1030 Subject: x86: unify 32 and 64-bit node_to_cpumask_map Impact: cleanup We take the 64-bit code and use it on 32-bit as well. The new file is called mm/numa.c. In a minor cleanup, we use cpu_none_mask instead of declaring a local cpu_mask_none. Signed-off-by: Rusty Russell --- arch/x86/include/asm/topology.h | 30 +++++++---------- arch/x86/kernel/smpboot.c | 5 --- arch/x86/mm/Makefile | 2 +- arch/x86/mm/numa.c | 71 +++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/numa_64.c | 69 --------------------------------------- 5 files changed, 83 insertions(+), 94 deletions(-) create mode 100644 arch/x86/mm/numa.c diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 216a960d4f10..dc31d929da04 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -44,9 +44,6 @@ #ifdef CONFIG_X86_32 -/* Mappings between node number and cpus on that node. */ -extern cpumask_t node_to_cpumask_map[]; - /* Mappings between logical cpu number and node number */ extern int cpu_to_node_map[]; @@ -57,19 +54,8 @@ static inline int cpu_to_node(int cpu) } #define early_cpu_to_node(cpu) cpu_to_node(cpu) -/* Returns a bitmask of CPUs on Node 'node'. */ -static inline const struct cpumask *cpumask_of_node(int node) -{ - return &node_to_cpumask_map[node]; -} - -static inline void setup_node_to_cpumask_map(void) { } - #else /* CONFIG_X86_64 */ -/* Mappings between node number and cpus on that node. */ -extern cpumask_t *node_to_cpumask_map; - /* Mappings between logical cpu number and node number */ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); @@ -80,7 +66,6 @@ DECLARE_PER_CPU(int, node_number); #ifdef CONFIG_DEBUG_PER_CPU_MAPS extern int cpu_to_node(int cpu); extern int early_cpu_to_node(int cpu); -extern const cpumask_t *cpumask_of_node(int node); #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ @@ -96,18 +81,25 @@ static inline int early_cpu_to_node(int cpu) return early_per_cpu(x86_cpu_to_node_map, cpu); } +#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#endif /* CONFIG_X86_64 */ + +/* Mappings between node number and cpus on that node. */ +extern cpumask_t *node_to_cpumask_map; + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +extern const cpumask_t *cpumask_of_node(int node); +#else /* Returns a pointer to the cpumask of CPUs on Node 'node'. */ static inline const cpumask_t *cpumask_of_node(int node) { return &node_to_cpumask_map[node]; } - -#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ +#endif extern void setup_node_to_cpumask_map(void); -#endif /* CONFIG_X86_64 */ - /* * Returns the number of the node containing Node 'node'. This * architecture is flat, so it is a pretty simple function! diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7f051c170add..c55639b1ab8a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -115,11 +115,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) - -/* which logical CPUs are on which nodes */ -cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly = - { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; -EXPORT_SYMBOL(node_to_cpumask_map); /* which node each logical CPU is on */ int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; EXPORT_SYMBOL(cpu_to_node_map); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 08537747cb58..fdd30d08ab52 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o -obj-$(CONFIG_NUMA) += numa_$(BITS).o +obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_K8_NUMA) += k8topology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c new file mode 100644 index 000000000000..f3a19e939e80 --- /dev/null +++ b/arch/x86/mm/numa.c @@ -0,0 +1,71 @@ +/* Common code for 32 and 64-bit NUMA */ +#include +#include +#include + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +# define DBG(x...) printk(KERN_DEBUG x) +#else +# define DBG(x...) +#endif + +/* + * Which logical CPUs are on which nodes + */ +cpumask_t *node_to_cpumask_map; +EXPORT_SYMBOL(node_to_cpumask_map); + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: node_to_cpumask() is not valid until after this is done. + * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) + */ +void __init setup_node_to_cpumask_map(void) +{ + unsigned int node, num = 0; + cpumask_t *map; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) { + for_each_node_mask(node, node_possible_map) + num = node; + nr_node_ids = num + 1; + } + + /* allocate the map */ + map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); + DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); + + pr_debug("Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); + + /* node_to_cpumask() will now work */ + node_to_cpumask_map = map; +} + +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +/* + * Returns a pointer to the bitmask of CPUs on Node 'node'. + */ +const cpumask_t *cpumask_of_node(int node) +{ + if (node_to_cpumask_map == NULL) { + printk(KERN_WARNING + "cpumask_of_node(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return cpu_online_mask; + } + if (node >= nr_node_ids) { + printk(KERN_WARNING + "cpumask_of_node(%d): node > nr_node_ids(%d)\n", + node, nr_node_ids); + dump_stack(); + return cpu_none_mask; + } + return &node_to_cpumask_map[node]; +} +EXPORT_SYMBOL(cpumask_of_node); +#endif diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 48bf396b6e79..eee149078862 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -20,12 +20,6 @@ #include #include -#ifdef CONFIG_DEBUG_PER_CPU_MAPS -# define DBG(x...) printk(KERN_DEBUG x) -#else -# define DBG(x...) -#endif - struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -48,12 +42,6 @@ EXPORT_PER_CPU_SYMBOL(node_number); DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); -/* - * Which logical CPUs are on which nodes - */ -cpumask_t *node_to_cpumask_map; -EXPORT_SYMBOL(node_to_cpumask_map); - /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -661,36 +649,6 @@ void __init init_cpu_to_node(void) #endif -/* - * Allocate node_to_cpumask_map based on number of available nodes - * Requires node_possible_map to be valid. - * - * Note: node_to_cpumask() is not valid until after this is done. - * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) - */ -void __init setup_node_to_cpumask_map(void) -{ - unsigned int node, num = 0; - cpumask_t *map; - - /* setup nr_node_ids if not done yet */ - if (nr_node_ids == MAX_NUMNODES) { - for_each_node_mask(node, node_possible_map) - num = node; - nr_node_ids = num + 1; - } - - /* allocate the map */ - map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); - - pr_debug("Node to cpumask map at %p for %d nodes\n", - map, nr_node_ids); - - /* node_to_cpumask() will now work */ - node_to_cpumask_map = map; -} - void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); @@ -799,33 +757,6 @@ int early_cpu_to_node(int cpu) return per_cpu(x86_cpu_to_node_map, cpu); } - -/* empty cpumask */ -static const cpumask_t cpu_mask_none; - -/* - * Returns a pointer to the bitmask of CPUs on Node 'node'. - */ -const cpumask_t *cpumask_of_node(int node) -{ - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "cpumask_of_node(%d): no node_to_cpumask_map!\n", - node); - dump_stack(); - return (const cpumask_t *)&cpu_online_map; - } - if (node >= nr_node_ids) { - printk(KERN_WARNING - "cpumask_of_node(%d): node > nr_node_ids(%d)\n", - node, nr_node_ids); - dump_stack(); - return &cpu_mask_none; - } - return &node_to_cpumask_map[node]; -} -EXPORT_SYMBOL(cpumask_of_node); - /* * --------- end of debug versions of the numa functions --------- */ -- cgit v1.2.3-59-g8ed1b From c032ef60d1aa9af33730b7a35bbea751b131adc1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:53 +1030 Subject: cpumask: convert node_to_cpumask_map[] to cpumask_var_t Impact: reduce kernel memory usage when CONFIG_CPUMASK_OFFSTACK=y Straightforward conversion: done for 32 and 64 bit kernels. node_to_cpumask_map is now a cpumask_var_t array. 64-bit used to be a dynamic cpumask_t array, and 32-bit used to be a static cpumask_t array. Signed-off-by: Rusty Russell --- arch/x86/include/asm/topology.h | 6 +++--- arch/x86/kernel/smpboot.c | 4 ++-- arch/x86/mm/numa.c | 28 ++++++++++++---------------- arch/x86/mm/numa_64.c | 14 +++++++------- 4 files changed, 24 insertions(+), 28 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index dc31d929da04..f8b833e1257f 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -86,15 +86,15 @@ static inline int early_cpu_to_node(int cpu) #endif /* CONFIG_X86_64 */ /* Mappings between node number and cpus on that node. */ -extern cpumask_t *node_to_cpumask_map; +extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; #ifdef CONFIG_DEBUG_PER_CPU_MAPS extern const cpumask_t *cpumask_of_node(int node); #else /* Returns a pointer to the cpumask of CPUs on Node 'node'. */ -static inline const cpumask_t *cpumask_of_node(int node) +static inline const struct cpumask *cpumask_of_node(int node) { - return &node_to_cpumask_map[node]; + return node_to_cpumask_map[node]; } #endif diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c55639b1ab8a..5a58a45ac1e3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -123,7 +123,7 @@ EXPORT_SYMBOL(cpu_to_node_map); static void map_cpu_to_node(int cpu, int node) { printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); - cpumask_set_cpu(cpu, &node_to_cpumask_map[node]); + cpumask_set_cpu(cpu, node_to_cpumask_map[node]); cpu_to_node_map[cpu] = node; } @@ -134,7 +134,7 @@ static void unmap_cpu_to_node(int cpu) printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); for (node = 0; node < MAX_NUMNODES; node++) - cpumask_clear_cpu(cpu, &node_to_cpumask_map[node]); + cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); cpu_to_node_map[cpu] = 0; } #else /* !(CONFIG_NUMA && CONFIG_X86_32) */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f3a19e939e80..429dc2d191fd 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -12,7 +12,7 @@ /* * Which logical CPUs are on which nodes */ -cpumask_t *node_to_cpumask_map; +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); /* @@ -25,7 +25,6 @@ EXPORT_SYMBOL(node_to_cpumask_map); void __init setup_node_to_cpumask_map(void) { unsigned int node, num = 0; - cpumask_t *map; /* setup nr_node_ids if not done yet */ if (nr_node_ids == MAX_NUMNODES) { @@ -35,14 +34,11 @@ void __init setup_node_to_cpumask_map(void) } /* allocate the map */ - map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids); + for (node = 0; node < nr_node_ids; node++) + alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); - pr_debug("Node to cpumask map at %p for %d nodes\n", - map, nr_node_ids); - - /* node_to_cpumask() will now work */ - node_to_cpumask_map = map; + /* cpumask_of_node() will now work */ + pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } #ifdef CONFIG_DEBUG_PER_CPU_MAPS @@ -51,13 +47,6 @@ void __init setup_node_to_cpumask_map(void) */ const cpumask_t *cpumask_of_node(int node) { - if (node_to_cpumask_map == NULL) { - printk(KERN_WARNING - "cpumask_of_node(%d): no node_to_cpumask_map!\n", - node); - dump_stack(); - return cpu_online_mask; - } if (node >= nr_node_ids) { printk(KERN_WARNING "cpumask_of_node(%d): node > nr_node_ids(%d)\n", @@ -65,6 +54,13 @@ const cpumask_t *cpumask_of_node(int node) dump_stack(); return cpu_none_mask; } + if (node_to_cpumask_map[node] == NULL) { + printk(KERN_WARNING + "cpumask_of_node(%d): no node_to_cpumask_map!\n", + node); + dump_stack(); + return cpu_online_mask; + } return &node_to_cpumask_map[node]; } EXPORT_SYMBOL(cpumask_of_node); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index eee149078862..9d2b3d2625cd 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -681,12 +681,12 @@ void __cpuinit numa_clear_node(int cpu) void __cpuinit numa_add_cpu(int cpu) { - cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); + cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } void __cpuinit numa_remove_cpu(int cpu) { - cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); + cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); } #else /* CONFIG_DEBUG_PER_CPU_MAPS */ @@ -700,17 +700,17 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) cpumask_t *mask; char buf[64]; - if (node_to_cpumask_map == NULL) { - printk(KERN_ERR "node_to_cpumask_map NULL\n"); + mask = node_to_cpumask_map[node]; + if (mask == NULL) { + printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); dump_stack(); return; } - mask = &node_to_cpumask_map[node]; if (enable) - cpu_set(cpu, *mask); + cpumask_set_cpu(cpu, mask); else - cpu_clear(cpu, *mask); + cpumask_clear_cpu(cpu, mask); cpulist_scnprintf(buf, sizeof(buf), mask); printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", -- cgit v1.2.3-59-g8ed1b From 155dd720d06a219ddf5a56b473cb3325441fc879 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:53 +1030 Subject: cpumask: convert struct cpuinfo_x86's llc_shared_map to cpumask_var_t Impact: reduce kernel memory usage when CONFIG_CPUMASK_OFFSTACK=y Signed-off-by: Rusty Russell --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/smpboot.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 76139506c3e4..d794d9483c56 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -94,7 +94,7 @@ struct cpuinfo_x86 { unsigned long loops_per_jiffy; #ifdef CONFIG_SMP /* cpus sharing the last level cache: */ - cpumask_t llc_shared_map; + cpumask_var_t llc_shared_map; #endif /* cpuid returned max cores value: */ u16 x86_max_cores; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5a58a45ac1e3..d6427aa56966 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -329,6 +329,23 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } +#ifdef CONFIG_CPUMASK_OFFSTACK +/* In this case, llc_shared_map is a pointer to a cpumask. */ +static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, + const struct cpuinfo_x86 *src) +{ + struct cpumask *llc = dst->llc_shared_map; + *dst = *src; + dst->llc_shared_map = llc; +} +#else +static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, + const struct cpuinfo_x86 *src) +{ + *dst = *src; +} +#endif /* CONFIG_CPUMASK_OFFSTACK */ + /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU @@ -338,7 +355,7 @@ void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); - *c = boot_cpu_data; + copy_cpuinfo_x86(c, &boot_cpu_data); c->cpu_index = id; if (id != 0) identify_secondary_cpu(c); @@ -362,15 +379,15 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpumask_set_cpu(cpu, cpu_sibling_mask(i)); cpumask_set_cpu(i, cpu_core_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, &c->llc_shared_map); - cpumask_set_cpu(cpu, &o->llc_shared_map); + cpumask_set_cpu(i, c->llc_shared_map); + cpumask_set_cpu(cpu, o->llc_shared_map); } } } else { cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); } - cpumask_set_cpu(cpu, &c->llc_shared_map); + cpumask_set_cpu(cpu, c->llc_shared_map); if (current_cpu_data.x86_max_cores == 1) { cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); @@ -381,8 +398,8 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { - cpumask_set_cpu(i, &c->llc_shared_map); - cpumask_set_cpu(cpu, &cpu_data(i).llc_shared_map); + cpumask_set_cpu(i, c->llc_shared_map); + cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); @@ -420,7 +437,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu) if (sched_mc_power_savings || sched_smt_power_savings) return cpu_core_mask(cpu); else - return &c->llc_shared_map; + return c->llc_shared_map; } static void impress_friends(void) @@ -1039,8 +1056,10 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(i) { alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); cpumask_clear(per_cpu(cpu_core_map, i)); cpumask_clear(per_cpu(cpu_sibling_map, i)); + cpumask_clear(cpu_data(i).llc_shared_map); } set_cpu_sibling_map(0); -- cgit v1.2.3-59-g8ed1b From 3f76a183de8ad3aeb7425f3d9685bb6003abd1a5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:54 +1030 Subject: x86: unify cpu_callin_mask/cpu_callout_mask/cpu_initialized_mask/cpu_sibling_setup_mask Impact: cleanup Signed-off-by: Rusty Russell --- arch/x86/include/asm/cpumask.h | 18 ------------------ arch/x86/kernel/cpu/common.c | 12 ------------ 2 files changed, 30 deletions(-) diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h index a7f3c75f8ad7..61c852fa346b 100644 --- a/arch/x86/include/asm/cpumask.h +++ b/arch/x86/include/asm/cpumask.h @@ -3,8 +3,6 @@ #ifndef __ASSEMBLY__ #include -#ifdef CONFIG_X86_64 - extern cpumask_var_t cpu_callin_mask; extern cpumask_var_t cpu_callout_mask; extern cpumask_var_t cpu_initialized_mask; @@ -12,21 +10,5 @@ extern cpumask_var_t cpu_sibling_setup_mask; extern void setup_cpu_local_masks(void); -#else /* CONFIG_X86_32 */ - -extern cpumask_t cpu_callin_map; -extern cpumask_t cpu_callout_map; -extern cpumask_t cpu_initialized; -extern cpumask_t cpu_sibling_setup_map; - -#define cpu_callin_mask ((struct cpumask *)&cpu_callin_map) -#define cpu_callout_mask ((struct cpumask *)&cpu_callout_map) -#define cpu_initialized_mask ((struct cpumask *)&cpu_initialized) -#define cpu_sibling_setup_mask ((struct cpumask *)&cpu_sibling_setup_map) - -static inline void setup_cpu_local_masks(void) { } - -#endif /* CONFIG_X86_32 */ - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_CPUMASK_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 826d5c876278..82f6cc045ad0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -41,8 +41,6 @@ #include "cpu.h" -#ifdef CONFIG_X86_64 - /* all of these masks are initialized in setup_cpu_local_masks() */ cpumask_var_t cpu_callin_mask; cpumask_var_t cpu_callout_mask; @@ -60,16 +58,6 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -#else /* CONFIG_X86_32 */ - -cpumask_t cpu_callin_map; -cpumask_t cpu_callout_map; -cpumask_t cpu_initialized; -cpumask_t cpu_sibling_setup_map; - -#endif /* CONFIG_X86_32 */ - - static struct cpu_dev *this_cpu __cpuinitdata; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { -- cgit v1.2.3-59-g8ed1b From 4f0628963c86d2f97b8cb9acc024a7fe288a6a57 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:54 +1030 Subject: cpumask: use new cpumask functions throughout x86 Impact: cleanup 1) &cpu_online_map -> cpu_online_mask 2) first_cpu/next_cpu_nr -> cpumask_first/cpumask_next 3) cpu_*_map manipulation -> init_cpu_* / set_cpu_* Signed-off-by: Rusty Russell --- arch/x86/include/asm/topology.h | 4 ++-- arch/x86/kernel/apic/bigsmp_32.c | 16 ++++++++-------- arch/x86/kernel/apic/es7000_32.c | 6 +++--- arch/x86/kernel/apic/summit_32.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +- arch/x86/kernel/cpu/proc.c | 4 ++-- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/smpboot.c | 11 +++++------ arch/x86/xen/smp.c | 6 +++--- 9 files changed, 26 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index f8b833e1257f..1ce1e1afa801 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -174,11 +174,11 @@ static inline int early_cpu_to_node(int cpu) static inline const cpumask_t *cpumask_of_node(int node) { - return &cpu_online_map; + return cpu_online_mask; } static inline int node_to_first_cpu(int node) { - return first_cpu(cpu_online_map); + return cpumask_first(cpu_online_mask); } static inline void setup_node_to_cpumask_map(void) { } diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index d806ecaa948f..676cdac385c0 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -26,12 +26,12 @@ static int bigsmp_apic_id_registered(void) return 1; } -static const cpumask_t *bigsmp_target_cpus(void) +static const struct cpumask *bigsmp_target_cpus(void) { #ifdef CONFIG_SMP - return &cpu_online_map; + return cpu_online_mask; #else - return &cpumask_of_cpu(0); + return cpumask_of(0); #endif } @@ -118,9 +118,9 @@ static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) } /* As we are using single CPU as destination, pick only one CPU here */ -static unsigned int bigsmp_cpu_mask_to_apicid(const cpumask_t *cpumask) +static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) { - return bigsmp_cpu_to_logical_apicid(first_cpu(*cpumask)); + return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); } static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, @@ -188,10 +188,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } /* NULL entry stops DMI scanning */ }; -static void bigsmp_vector_allocation_domain(int cpu, cpumask_t *retmask) +static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask) { - cpus_clear(*retmask); - cpu_set(cpu, *retmask); + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); } static int probe_bigsmp(void) diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 1322f5409e20..26d3a3eba75b 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -460,9 +460,9 @@ static const cpumask_t *target_cpus_cluster(void) return cpu_all_mask; } -static const cpumask_t *es7000_target_cpus(void) +static const struct cpumask *es7000_target_cpus(void) { - return &cpumask_of_cpu(smp_processor_id()); + return cpumask_of(smp_processor_id()); } static unsigned long @@ -517,7 +517,7 @@ static void es7000_setup_apic_routing(void) "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? "Physical Cluster" : "Logical Cluster", - nr_ioapics, cpus_addr(*es7000_target_cpus())[0]); + nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); } static int es7000_apicid_to_node(int logical_apicid) diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index aac52fa873ff..7f6bd908da46 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -192,7 +192,7 @@ static const cpumask_t *summit_target_cpus(void) * dest_LowestPrio mode logical clustered apic interrupt routing * Just start on cpu 0. IRQ balancing will spread load */ - return &cpumask_of_cpu(0); + return cpumask_of(0); } static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index aaa7d9730938..96b2a85545aa 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -249,7 +249,7 @@ void cmci_rediscover(int dying) for_each_online_cpu (cpu) { if (cpu == dying) continue; - if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu))) + if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) continue; /* Recheck banks in case CPUs don't all have the same */ if (cmci_supported(&banks)) diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 4dd610e226e0..f93047fed791 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -143,9 +143,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ - *pos = first_cpu(cpu_online_map); + *pos = cpumask_first(cpu_online_mask); else - *pos = next_cpu_nr(*pos - 1, cpu_online_map); + *pos = cpumask_next(*pos - 1, cpu_online_mask); if ((*pos) < nr_cpu_ids) return &cpu_data(*pos); return NULL; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index cad5431951aa..6638294cec8d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -324,7 +324,7 @@ void stop_this_cpu(void *dummy) /* * Remove this CPU: */ - cpu_clear(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), false); disable_local_APIC(); for (;;) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d6427aa56966..58d24ef917d8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -296,7 +296,7 @@ notrace static void __cpuinit start_secondary(void *unused) __flush_tlb_all(); #endif - /* This must be done before setting cpu_online_map */ + /* This must be done before setting cpu_online_mask */ set_cpu_sibling_map(raw_smp_processor_id()); wmb(); @@ -904,9 +904,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) */ static __init void disable_smp(void) { - /* use the read/write pointers to the present and possible maps */ - cpumask_copy(&cpu_present_map, cpumask_of(0)); - cpumask_copy(&cpu_possible_map, cpumask_of(0)); + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); smpboot_clear_io_apic_irqs(); if (smp_found_config) @@ -1149,11 +1148,11 @@ early_param("possible_cpus", _setup_possible_cpus); /* - * cpu_possible_map should be static, it cannot change as cpu's + * cpu_possible_mask should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures * are allocated by some modules at init time, and dont expect to * do this dynamically on cpu arrival/departure. - * cpu_present_map on the other hand can change dynamically. + * cpu_present_mask on the other hand can change dynamically. * In case when cpu_hotplug is not compiled, then we resort to current * behaviour, which is cpu_possible == cpu_present. * - Ashok Raj diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 8d470562ffc9..585a6e330837 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void) rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) { num_processors++; - cpu_set(i, cpu_possible_map); + set_cpu_possible(i, true); } } } @@ -197,7 +197,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) continue; - cpu_clear(cpu, cpu_possible_map); + set_cpu_possible(cpu, false); } for_each_possible_cpu (cpu) { @@ -210,7 +210,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); - cpu_set(cpu, cpu_present_map); + set_cpu_present(cpu, true); } } -- cgit v1.2.3-59-g8ed1b From 70ba2b6a70f0077707977e3b107478768492040d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:55 +1030 Subject: cpumask: clean up summit's send_IPI functions Impact: cleanup, remove cpumask from stack summit_send_IPI_allbutself might as well call default_send_IPI_mask_allbutself_logical(). Also change cpumask_t to struct cpumask and &cpu_online_map to cpu_online_mask while here. Signed-off-by: Rusty Russell --- arch/x86/kernel/apic/summit_32.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 7f6bd908da46..dda0a77a36f1 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -53,23 +53,19 @@ static unsigned summit_get_apic_id(unsigned long x) return (x >> 24) & 0xFF; } -static inline void summit_send_IPI_mask(const cpumask_t *mask, int vector) +static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector) { default_send_IPI_mask_sequence_logical(mask, vector); } static void summit_send_IPI_allbutself(int vector) { - cpumask_t mask = cpu_online_map; - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - summit_send_IPI_mask(&mask, vector); + default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector); } static void summit_send_IPI_all(int vector) { - summit_send_IPI_mask(&cpu_online_map, vector); + summit_send_IPI_mask(cpu_online_mask, vector); } #include -- cgit v1.2.3-59-g8ed1b From d680eb8bcd0e43b20067fd2c810d76463db5444e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:56 +1030 Subject: cpumask: make Xen use the new operators. Impact: cleanup In particular, *map are deprecated, and you have to use the accessors as *mask are const. Signed-off-by: Rusty Russell To: Jeremy Fitzhardinge --- drivers/xen/cpu_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 974f56d1ebe1..5f54c01c1568 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -10,7 +10,7 @@ static void enable_hotplug_cpu(int cpu) if (!cpu_present(cpu)) arch_register_cpu(cpu); - cpu_set(cpu, cpu_present_map); + set_cpu_present(cpu, true); } static void disable_hotplug_cpu(int cpu) @@ -18,7 +18,7 @@ static void disable_hotplug_cpu(int cpu) if (cpu_present(cpu)) arch_unregister_cpu(cpu); - cpu_clear(cpu, cpu_present_map); + set_cpu_present(cpu, false); } static void vcpu_hotplug(unsigned int cpu) -- cgit v1.2.3-59-g8ed1b From 5c6cb5e2b1798694c859fd3821a34404355e1030 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:56 +1030 Subject: cpumask: remove cpumask_t assignment from vector_allocation_domain() Impact: cleanup It's not legal to do assignments into cpumask_var_t; they will soon be of variable length. So explicitly clear the mask and set the first word, rather than using assignment. Signed-off-by: Rusty Russell --- arch/x86/kernel/apic/es7000_32.c | 3 ++- arch/x86/kernel/apic/numaq_32.c | 3 ++- arch/x86/kernel/apic/probe_32.c | 3 ++- arch/x86/kernel/apic/summit_32.c | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 26d3a3eba75b..12c8e19ef96e 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -420,7 +420,8 @@ static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index e4ce98af8c74..9562de1b8882 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -472,7 +472,8 @@ static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; } static void numaq_setup_portio_remap(void) diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 141c99a1c264..01eda2ac65e4 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -83,7 +83,8 @@ static void default_vector_allocation_domain(int cpu, struct cpumask *retmask) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } }; + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; } /* should be called last. */ diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index dda0a77a36f1..278e863c71b5 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -352,7 +352,8 @@ static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask) * deliver interrupts to the wrong hyperthread when only one * hyperthread was specified in the interrupt desitination. */ - *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } }; + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; } #ifdef CONFIG_X86_SUMMIT_NUMA -- cgit v1.2.3-59-g8ed1b From 76ba0ecda0de9accea9a91cb6dbde46782110e1c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:57 +1030 Subject: cpumask: use cpumask_var_t in uv_flush_tlb_others. Impact: remove cpumask_t, reduce per-cpu size for CONFIG_CPUMASK_OFFSTACK=y Signed-off-by: Rusty Russell --- arch/x86/kernel/tlb_uv.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d038b9c45cf8..8afb69180c9b 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -275,6 +275,8 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade, return NULL; } +static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); + /** * uv_flush_tlb_others - globally purge translation cache of a virtual * address or all TLB's @@ -304,8 +306,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va, unsigned int cpu) { - static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask); - struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask); + struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); int i; int bit; int blade; @@ -755,6 +756,10 @@ static int __init uv_bau_init(void) if (!is_uv_system()) return 0; + for_each_possible_cpu(cur_cpu) + alloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), + GFP_KERNEL, cpu_to_node(cur_cpu)); + uv_bau_retry_limit = 1; uv_nshift = uv_hub_info->n_val; uv_mmask = (1UL << uv_hub_info->n_val) - 1; -- cgit v1.2.3-59-g8ed1b From 73e907de7d5cecef43d9949ab8f4fdca508168c7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 14:49:57 +1030 Subject: cpumask: remove x86 cpumask_t uses. Impact: cleanup We are removing cpumask_t in favour of struct cpumask: mainly as a marker of what code is now CONFIG_CPUMASK_OFFSTACK-safe. The only non-trivial change here is vector_allocation_domain(): explicitly clear the mask and set the first word, rather than using assignment. Signed-off-by: Rusty Russell --- arch/x86/include/asm/topology.h | 4 ++-- arch/x86/kernel/apic/es7000_32.c | 6 +++--- arch/x86/kernel/apic/numaq_32.c | 6 +++--- arch/x86/kernel/apic/summit_32.c | 6 +++--- arch/x86/mm/numa.c | 2 +- arch/x86/mm/numa_64.c | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 1ce1e1afa801..e3f4198371a9 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -89,7 +89,7 @@ static inline int early_cpu_to_node(int cpu) extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; #ifdef CONFIG_DEBUG_PER_CPU_MAPS -extern const cpumask_t *cpumask_of_node(int node); +extern const struct cpumask *cpumask_of_node(int node); #else /* Returns a pointer to the cpumask of CPUs on Node 'node'. */ static inline const struct cpumask *cpumask_of_node(int node) @@ -172,7 +172,7 @@ static inline int early_cpu_to_node(int cpu) return 0; } -static inline const cpumask_t *cpumask_of_node(int node) +static inline const struct cpumask *cpumask_of_node(int node) { return cpu_online_mask; } diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 12c8e19ef96e..1c11b819f245 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -410,7 +410,7 @@ static void es7000_enable_apic_mode(void) WARN(1, "Command failed, status = %x\n", mip_status); } -static void es7000_vector_allocation_domain(int cpu, cpumask_t *retmask) +static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest @@ -456,7 +456,7 @@ static int es7000_apic_id_registered(void) return 1; } -static const cpumask_t *target_cpus_cluster(void) +static const struct cpumask *target_cpus_cluster(void) { return cpu_all_mask; } @@ -573,7 +573,7 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid) return 1; } -static unsigned int es7000_cpu_mask_to_apicid(const cpumask_t *cpumask) +static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) { unsigned int round = 0; int cpu, uninitialized_var(apicid); diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 9562de1b8882..533e59c6fc82 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -334,7 +334,7 @@ static inline void numaq_smp_callin_clear_local_apic(void) clear_local_APIC(); } -static inline const cpumask_t *numaq_target_cpus(void) +static inline const struct cpumask *numaq_target_cpus(void) { return cpu_all_mask; } @@ -427,7 +427,7 @@ static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static inline unsigned int numaq_cpu_mask_to_apicid(const cpumask_t *cpumask) +static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask) { return 0x0F; } @@ -462,7 +462,7 @@ static int probe_numaq(void) return found_numaq; } -static void numaq_vector_allocation_domain(int cpu, cpumask_t *retmask) +static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 278e863c71b5..9cfe1f415d81 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -182,7 +182,7 @@ static inline int is_WPEG(struct rio_detail *rio){ #define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER) -static const cpumask_t *summit_target_cpus(void) +static const struct cpumask *summit_target_cpus(void) { /* CPU_MASK_ALL (0xff) has undefined behaviour with * dest_LowestPrio mode logical clustered apic interrupt routing @@ -285,7 +285,7 @@ static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) return 1; } -static unsigned int summit_cpu_mask_to_apicid(const cpumask_t *cpumask) +static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) { unsigned int round = 0; int cpu, apicid = 0; @@ -342,7 +342,7 @@ static int probe_summit(void) return 0; } -static void summit_vector_allocation_domain(int cpu, cpumask_t *retmask) +static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask) { /* Careful. Some cpus do not strictly honor the set of cpus * specified in the interrupt destination when using lowest diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 429dc2d191fd..ce255e32a593 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -45,7 +45,7 @@ void __init setup_node_to_cpumask_map(void) /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ -const cpumask_t *cpumask_of_node(int node) +const struct cpumask *cpumask_of_node(int node) { if (node >= nr_node_ids) { printk(KERN_WARNING diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 9d2b3d2625cd..d73aaa892371 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -697,7 +697,7 @@ void __cpuinit numa_remove_cpu(int cpu) static void __cpuinit numa_set_cpumask(int cpu, int enable) { int node = early_cpu_to_node(cpu); - cpumask_t *mask; + struct cpumask *mask; char buf[64]; mask = node_to_cpumask_map[node]; -- cgit v1.2.3-59-g8ed1b From 7f96f93f02b7637491a1637dee12dcdcd40b9802 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Mar 2009 00:37:42 -0400 Subject: tracing: move binary buffers into per cpu directory The binary_buffers directory in /debugfs/tracing held the files to read the trace buffers in a binary format. This held one file per CPU buffer. But we also have a per_cpu directory that holds a way to read the pretty-print formats. This patch moves the binary buffers into the per_cpu_directory: # ls /debug/tracing/per_cpu/cpu1/ trace trace_pipe trace_pipe_raw The new name is called "trace_pipe_raw". The binary buffers always acted similar to trace_pipe, except that they produce raw data. Requested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dbb077d8a172..efe3202c0209 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3543,6 +3543,11 @@ static void tracing_init_debugfs_percpu(long cpu) (void *) cpu, &tracing_fops); if (!entry) pr_warning("Could not create debugfs 'trace' entry\n"); + + entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, + (void *) cpu, &tracing_buffers_fops); + if (!entry) + pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); } #ifdef CONFIG_FTRACE_SELFTEST @@ -3826,7 +3831,6 @@ static __init void create_trace_options_dir(void) static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *buffers; struct dentry *entry; int cpu; @@ -3899,26 +3903,6 @@ static __init int tracer_init_debugfs(void) pr_warning("Could not create debugfs " "'trace_marker' entry\n"); - buffers = debugfs_create_dir("binary_buffers", d_tracer); - - if (!buffers) - pr_warning("Could not create buffers directory\n"); - else { - int cpu; - char buf[64]; - - for_each_tracing_cpu(cpu) { - sprintf(buf, "%d", cpu); - - entry = debugfs_create_file(buf, 0444, buffers, - (void *)(long)cpu, - &tracing_buffers_fops); - if (!entry) - pr_warning("Could not create debugfs buffers " - "'%s' entry\n", buf); - } - } - #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, -- cgit v1.2.3-59-g8ed1b From 899039e8746bb9a09b6487ddb8ab2275ce9d0256 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 Mar 2009 00:43:33 -0400 Subject: softirq: no need to have SOFTIRQ in softirq name Impact: clean up It is redundant to have 'SOFTIRQ' in the softirq names. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/softirq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index a5e81231ca7a..65ff3e3961b4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -55,9 +55,8 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { - "HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ", - "BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ", - "RCU_SOFTIRQ" + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", + "TASKLET", "SCHED", "HRTIMER", "RCU" }; /* -- cgit v1.2.3-59-g8ed1b From ee08c6eccb7d1295516f7cf420fddf7b14e9146f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Mar 2009 05:52:59 +0100 Subject: tracing/ftrace: syscall tracing infrastructure, basics Provide basic callbacks to do syscall tracing. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1236401580-5758-2-git-send-email-fweisbec@gmail.com> [ simplified it to a trace_printk() for now. ] Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 21 ++++++++ kernel/trace/Kconfig | 10 ++++ kernel/trace/Makefile | 1 + kernel/trace/trace.h | 2 + kernel/trace/trace_syscalls.c | 113 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 147 insertions(+) create mode 100644 kernel/trace/trace_syscalls.c diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e1583f2639b0..c146c1021a29 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -503,4 +503,25 @@ static inline void trace_hw_branch_oops(void) {} #endif /* CONFIG_HW_BRANCH_TRACER */ +/* + * A syscall entry in the ftrace syscalls array. + * + * @syscall_nr: syscall number + */ +struct syscall_trace_entry { + int syscall_nr; +}; + +#ifdef CONFIG_FTRACE_SYSCALLS +extern void start_ftrace_syscalls(void); +extern void stop_ftrace_syscalls(void); +extern void ftrace_syscall_enter(struct pt_regs *regs); +extern void ftrace_syscall_exit(struct pt_regs *regs); +#else +static inline void start_ftrace_syscalls(void) { } +static inline void stop_ftrace_syscalls(void) { } +static inline void ftrace_syscall_enter(struct pt_regs *regs) { } +static inline void ftrace_syscall_exit(struct pt_regs *regs) { } +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8e4a2a61cd75..95a0ad191f19 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -34,6 +34,9 @@ config HAVE_FTRACE_MCOUNT_RECORD config HAVE_HW_BRANCH_TRACER bool +config HAVE_FTRACE_SYSCALLS + bool + config TRACER_MAX_TRACE bool @@ -175,6 +178,13 @@ config EVENT_TRACER allowing the user to pick and choose which trace point they want to trace. +config FTRACE_SYSCALLS + bool "Trace syscalls" + depends on HAVE_FTRACE_SYSCALLS + select TRACING + help + Basic tracer to catch the syscall entry and exit events. + config BOOT_TRACER bool "Trace boot initcalls" select TRACING diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c7a2943796eb..c3feea01c3e0 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -43,5 +43,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACER) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c5e1d8865fe4..3d49daae47dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -30,6 +30,8 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, + TRACE_SYSCALL_ENTER, + TRACE_SYSCALL_EXIT, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c new file mode 100644 index 000000000000..66cf97449af3 --- /dev/null +++ b/kernel/trace/trace_syscalls.c @@ -0,0 +1,113 @@ +#include +#include + +#include + +#include "trace_output.h" +#include "trace.h" + +static atomic_t refcount; + +void start_ftrace_syscalls(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (atomic_inc_return(&refcount) != 1) + goto out; + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, t) { + set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + + read_unlock_irqrestore(&tasklist_lock, flags); +out: + atomic_dec(&refcount); +} + +void stop_ftrace_syscalls(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + if (atomic_dec_return(&refcount)) + goto out; + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, t) { + clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + + read_unlock_irqrestore(&tasklist_lock, flags); +out: + atomic_inc(&refcount); +} + +void ftrace_syscall_enter(struct pt_regs *regs) +{ + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + + trace_printk("syscall %d enter\n", syscall_nr); +} + +void ftrace_syscall_exit(struct pt_regs *regs) +{ + int syscall_nr; + + syscall_nr = syscall_get_nr(current, regs); + + trace_printk("syscall %d exit\n", syscall_nr); +} + +static int init_syscall_tracer(struct trace_array *tr) +{ + start_ftrace_syscalls(); + + return 0; +} + +static void reset_syscall_tracer(struct trace_array *tr) +{ + stop_ftrace_syscalls(); +} + +static struct trace_event syscall_enter_event = { + .type = TRACE_SYSCALL_ENTER, +}; + +static struct trace_event syscall_exit_event = { + .type = TRACE_SYSCALL_EXIT, +}; + +static struct tracer syscall_tracer __read_mostly = { + .name = "syscall", + .init = init_syscall_tracer, + .reset = reset_syscall_tracer +}; + +__init int register_ftrace_syscalls(void) +{ + int ret; + + ret = register_ftrace_event(&syscall_enter_event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + syscall_enter_event.type); + WARN_ON_ONCE(1); + } + + ret = register_ftrace_event(&syscall_exit_event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + syscall_exit_event.type); + WARN_ON_ONCE(1); + } + + return register_tracer(&syscall_tracer); +} +device_initcall(register_ftrace_syscalls); -- cgit v1.2.3-59-g8ed1b From 1b3fa2ce64363c289b3b14723cca7290bf91cfce Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 7 Mar 2009 05:53:00 +0100 Subject: tracing/x86: basic implementation of syscall tracing for x86 Provide the x86 trace callbacks to trace syscalls. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1236401580-5758-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/thread_info.h | 9 ++++++--- arch/x86/kernel/ptrace.c | 7 +++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bdcee12c25ab..b0a638b4199a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -35,6 +35,7 @@ config X86 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE + select HAVE_FTRACE_SYSCALLS select HAVE_KVM select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index df9d5f78385e..8820a73ae090 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -94,6 +94,7 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ +#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -115,15 +116,17 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP) + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ + _TIF_SYSCALL_FTRACE) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ @@ -132,7 +135,7 @@ struct thread_info { _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) /* work to do on any return to user space */ -#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) +#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 3d9672e59c16..99749d6e87a8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -1416,6 +1417,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) tracehook_report_syscall_entry(regs)) ret = -1L; + if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + ftrace_syscall_enter(regs); + if (unlikely(current->audit_context)) { if (IS_IA32) audit_syscall_entry(AUDIT_ARCH_I386, @@ -1439,6 +1443,9 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + ftrace_syscall_exit(regs); + if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); -- cgit v1.2.3-59-g8ed1b From b00f0b6dc1773b4c8f538503247da050b5ea631b Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 13 Mar 2009 17:14:01 +0800 Subject: ftrace: avoid double-free of dyn_ftrace If dyn_ftrace is freed before ftrace_release(), ftrace_release() will free it again and make ftrace_free_records wrong. Signed-off-by: Zhao Lei Cc: "Steven Rostedt ;" LKML-Reference: <49BA23D9.1050900@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d33d306bdcf4..26c45aaf6805 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -356,7 +356,8 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) + if ((rec->ip >= s) && (rec->ip < e) && + !(rec->flags & FTRACE_FL_FREE)) ftrace_free_rec(rec); } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); -- cgit v1.2.3-59-g8ed1b From fa9d13cf135efbd454453a53b6299976bea245a9 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 13 Mar 2009 17:16:34 +0800 Subject: ftrace: don't try to __ftrace_replace_code on !FTRACE_FL_CONVERTED rec Do __ftrace_replace_code for !FTRACE_FL_CONVERTED rec will always fail, we should ignore this rec. Signed-off-by: Zhao Lei Cc: "Steven Rostedt ;" LKML-Reference: <49BA2472.4060206@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 26c45aaf6805..08f4a624e31f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -532,11 +532,12 @@ static void ftrace_replace_code(int enable) do_for_each_ftrace_rec(pg, rec) { /* - * Skip over free records and records that have - * failed. + * Skip over free records, records that have + * failed and not converted. */ if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED) + rec->flags & FTRACE_FL_FAILED || + rec->flags & FTRACE_FL_CONVERTED) continue; /* ignore updates to this record's mcount site */ @@ -548,7 +549,7 @@ static void ftrace_replace_code(int enable) } failed = __ftrace_replace_code(rec, enable); - if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { + if (failed) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || !core_kernel_text(rec->ip)) { -- cgit v1.2.3-59-g8ed1b From 641cd4cfcdc71ce01535b31cc4d57d59a1fae1fc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 10:47:34 +0100 Subject: generic-ipi: eliminate WARN_ON()s during oops/panic Do not output smp-call related warnings in the oops/panic codepath. Reported-by: Jan Beulich Acked-by: Peter Zijlstra LKML-Reference: <49B91A7E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- kernel/smp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 7ad2262d2eca..858baac568ee 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include @@ -285,7 +286,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, this_cpu = get_cpu(); /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -329,7 +330,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, csd_lock(data); /* Can deadlock when called with interrupts disabled */ - WARN_ON(wait && irqs_disabled()); + WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); generic_exec_single(cpu, data, wait); } @@ -365,7 +366,7 @@ void smp_call_function_many(const struct cpumask *mask, int cpu, next_cpu, this_cpu = smp_processor_id(); /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); -- cgit v1.2.3-59-g8ed1b From ffd71da4e3f323b7673b061e6f7e0d0c12dc2b49 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 10:54:24 +0100 Subject: panic: decrease oops_in_progress only after having done the panic Impact: eliminate secondary warnings during panic() We can panic() in a number of difficult, atomic contexts, hence we use bust_spinlocks(1) in panic() to increase oops_in_progress, which prevents various debug checks we have in place. But in practice this protection only covers the first few printk's done by panic() - it does not cover the later attempt to stop all other CPUs and kexec(). If a secondary warning triggers in one of those facilities that can make the panic message scroll off. So do bust_spinlocks(0) only much later in panic(). (which code is only reached if panic policy is relaxed that it can return after a warning message) Reported-by: Jan Beulich LKML-Reference: <49B91A7E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/panic.c b/kernel/panic.c index 32fe4eff1b89..57fb005de546 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -77,7 +77,6 @@ NORET_TYPE void panic(const char * fmt, ...) #ifdef CONFIG_DEBUG_BUGVERBOSE dump_stack(); #endif - bust_spinlocks(0); /* * If we have crashed and we have a crash kernel loaded let it handle @@ -136,6 +135,7 @@ NORET_TYPE void panic(const char * fmt, ...) mdelay(1); i++; } + bust_spinlocks(0); } EXPORT_SYMBOL(panic); -- cgit v1.2.3-59-g8ed1b From d1dedb52acd98bd5e13e1ff4c4d045d58bbd16fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:14:06 +0100 Subject: panic, smp: provide smp_send_stop() wrapper on UP too Impact: cleanup, no code changed Remove an ugly #ifdef CONFIG_SMP from panic(), by providing an smp_send_stop() wrapper on UP too. LKML-Reference: <49B91A7E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- include/linux/smp.h | 4 +++- kernel/panic.c | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index 2d3bcb6b37ff..a69db820eed6 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -38,7 +38,7 @@ int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. * (defined in asm header): - */ + */ /* * stops all CPUs but the current one: @@ -122,6 +122,8 @@ extern unsigned int setup_max_cpus; #else /* !SMP */ +static inline void smp_send_stop(void) { } + /* * These macros fold the SMP functionality into a single CPU system */ diff --git a/kernel/panic.c b/kernel/panic.c index 57fb005de546..ca75e819d0ea 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -85,14 +85,12 @@ NORET_TYPE void panic(const char * fmt, ...) */ crash_kexec(NULL); -#ifdef CONFIG_SMP /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic * situation. */ smp_send_stop(); -#endif atomic_notifier_call_chain(&panic_notifier_list, 0, buf); -- cgit v1.2.3-59-g8ed1b From c95dbf27e201587b2a6cf63f8501a0313d9b4801 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:14:06 +0100 Subject: panic: clean up kernel/panic.c Impact: cleanup, no code changed Clean up kernel/panic.c some more and make it more consistent. LKML-Reference: <49B91A7E.76E4.0078.0@novell.com> Signed-off-by: Ingo Molnar --- kernel/panic.c | 111 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 59 insertions(+), 52 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index ca75e819d0ea..3fd8c5bf8b39 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -8,19 +8,19 @@ * This function is used through-out the kernel (including mm and fs) * to indicate a major problem. */ +#include +#include +#include +#include #include -#include -#include +#include #include -#include -#include +#include +#include +#include #include -#include +#include #include -#include -#include -#include -#include #include int panic_on_oops; @@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ - NORET_TYPE void panic(const char * fmt, ...) { - long i; static char buf[1024]; va_list args; -#if defined(CONFIG_S390) - unsigned long caller = (unsigned long) __builtin_return_address(0); -#endif + long i; /* - * It's possible to come here directly from a panic-assertion and not - * have preempt disabled. Some functions called from here want + * It's possible to come here directly from a panic-assertion and + * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... */ preempt_disable(); @@ -99,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...) if (panic_timeout > 0) { /* - * Delay timeout seconds before rebooting the machine. - * We can't use the "normal" timers since we just panicked.. - */ - printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); + * Delay timeout seconds before rebooting the machine. + * We can't use the "normal" timers since we just panicked. + */ + printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); + for (i = 0; i < panic_timeout*1000; ) { touch_nmi_watchdog(); i += panic_blink(i); mdelay(1); i++; } - /* This will not be a clean reboot, with everything - * shutting down. But if there is a chance of - * rebooting the system it will be rebooted. + /* + * This will not be a clean reboot, with everything + * shutting down. But if there is a chance of + * rebooting the system it will be rebooted. */ emergency_restart(); } @@ -124,10 +122,15 @@ NORET_TYPE void panic(const char * fmt, ...) } #endif #if defined(CONFIG_S390) - disabled_wait(caller); + { + unsigned long caller; + + caller = (unsigned long)__builtin_return_address(0); + disabled_wait(caller); + } #endif local_irq_enable(); - for (i = 0;;) { + for (i = 0; ; ) { touch_softlockup_watchdog(); i += panic_blink(i); mdelay(1); @@ -140,23 +143,23 @@ EXPORT_SYMBOL(panic); struct tnt { - u8 bit; - char true; - char false; + u8 bit; + char true; + char false; }; static const struct tnt tnts[] = { - { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, - { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_UNSAFE_SMP, 'S', ' ' }, - { TAINT_FORCED_RMMOD, 'R', ' ' }, - { TAINT_MACHINE_CHECK, 'M', ' ' }, - { TAINT_BAD_PAGE, 'B', ' ' }, - { TAINT_USER, 'U', ' ' }, - { TAINT_DIE, 'D', ' ' }, - { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, - { TAINT_WARN, 'W', ' ' }, - { TAINT_CRAP, 'C', ' ' }, + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, + { TAINT_CRAP, 'C', ' ' }, }; /** @@ -193,7 +196,8 @@ const char *print_tainted(void) *s = 0; } else snprintf(buf, sizeof(buf), "Not tainted"); - return(buf); + + return buf; } int test_taint(unsigned flag) @@ -209,7 +213,8 @@ unsigned long get_taint(void) void add_taint(unsigned flag) { - debug_locks = 0; /* can't trust the integrity of the kernel anymore */ + /* can't trust the integrity of the kernel anymore: */ + debug_locks = 0; set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); @@ -264,8 +269,8 @@ static void do_oops_enter_exit(void) } /* - * Return true if the calling CPU is allowed to print oops-related info. This - * is a bit racy.. + * Return true if the calling CPU is allowed to print oops-related info. + * This is a bit racy.. */ int oops_may_print(void) { @@ -274,20 +279,22 @@ int oops_may_print(void) /* * Called when the architecture enters its oops handler, before it prints - * anything. If this is the first CPU to oops, and it's oopsing the first time - * then let it proceed. + * anything. If this is the first CPU to oops, and it's oopsing the first + * time then let it proceed. * - * This is all enabled by the pause_on_oops kernel boot option. We do all this - * to ensure that oopses don't scroll off the screen. It has the side-effect - * of preventing later-oopsing CPUs from mucking up the display, too. + * This is all enabled by the pause_on_oops kernel boot option. We do all + * this to ensure that oopses don't scroll off the screen. It has the + * side-effect of preventing later-oopsing CPUs from mucking up the display, + * too. * - * It turns out that the CPU which is allowed to print ends up pausing for the - * right duration, whereas all the other CPUs pause for twice as long: once in - * oops_enter(), once in oops_exit(). + * It turns out that the CPU which is allowed to print ends up pausing for + * the right duration, whereas all the other CPUs pause for twice as long: + * once in oops_enter(), once in oops_exit(). */ void oops_enter(void) { - debug_locks_off(); /* can't trust the integrity of the kernel anymore */ + /* can't trust the integrity of the kernel anymore: */ + debug_locks_off(); do_oops_enter_exit(); } -- cgit v1.2.3-59-g8ed1b From 850a80cfaa5aec3e626eb3736eff890a80e4fa77 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 13 Mar 2009 17:47:23 +0800 Subject: ftrace: use seq_read Impact: cleanup VFS layer has tested the file mode, we do not need test it. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <49BA2BAB.6010608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08f4a624e31f..bf78a4c75c67 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1120,16 +1120,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file) return ftrace_regex_open(inode, file, 0); } -static ssize_t -ftrace_regex_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - if (file->f_mode & FMODE_READ) - return seq_read(file, ubuf, cnt, ppos); - else - return -EPERM; -} - static loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { @@ -1882,7 +1872,7 @@ static const struct file_operations ftrace_failures_fops = { static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, - .read = ftrace_regex_read, + .read = seq_read, .write = ftrace_filter_write, .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, @@ -1890,7 +1880,7 @@ static const struct file_operations ftrace_filter_fops = { static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, - .read = ftrace_regex_read, + .read = seq_read, .write = ftrace_notrace_write, .llseek = ftrace_regex_lseek, .release = ftrace_notrace_release, @@ -1992,16 +1982,6 @@ ftrace_graph_open(struct inode *inode, struct file *file) return ret; } -static ssize_t -ftrace_graph_read(struct file *file, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - if (file->f_mode & FMODE_READ) - return seq_read(file, ubuf, cnt, ppos); - else - return -EPERM; -} - static int ftrace_set_func(unsigned long *array, int *idx, char *buffer) { @@ -2132,7 +2112,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, - .read = ftrace_graph_read, + .read = seq_read, .write = ftrace_graph_write, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- cgit v1.2.3-59-g8ed1b From e94142a67f8bad494c593f0a07c9fc2fbec98c0e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 13 Mar 2009 17:51:27 +0800 Subject: ftrace: remove struct list_head from struct dyn_ftrace Impact: save memory The struct dyn_ftrace table is very large, this patch will save about 50%. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt LKML-Reference: <49BA2C9F.8020009@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 - kernel/trace/ftrace.c | 14 ++++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c146c1021a29..9d598bbf28a6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,7 +145,6 @@ enum { }; struct dyn_ftrace { - struct list_head list; unsigned long ip; /* address of mcount call-site */ unsigned long flags; struct dyn_arch_ftrace arch; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bf78a4c75c67..90d5729afeff 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -272,7 +272,7 @@ enum { static int ftrace_filtered; -static LIST_HEAD(ftrace_new_addrs); +static struct dyn_ftrace *ftrace_new_addrs; static DEFINE_MUTEX(ftrace_regex_lock); @@ -409,8 +409,8 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - - list_add(&rec->list, &ftrace_new_addrs); + rec->flags = (unsigned long)ftrace_new_addrs; + ftrace_new_addrs = rec; return rec; } @@ -716,19 +716,21 @@ unsigned long ftrace_update_tot_cnt; static int ftrace_update_code(struct module *mod) { - struct dyn_ftrace *p, *t; + struct dyn_ftrace *p; cycle_t start, stop; start = ftrace_now(raw_smp_processor_id()); ftrace_update_cnt = 0; - list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { + while (ftrace_new_addrs) { /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; - list_del_init(&p->list); + p = ftrace_new_addrs; + ftrace_new_addrs = (struct dyn_ftrace *)p->flags; + p->flags = 0L; /* convert record (i.e, patch mcount-call with NOP) */ if (ftrace_code_disable(mod, p)) { -- cgit v1.2.3-59-g8ed1b From 0b966252d9e5d95ec2d11e63d7e55b42913aa5b7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 23:42:42 +1030 Subject: cpumask: convert node_to_cpumask_map[] to cpumask_var_t Impact: fix (CONFIG_MAXSMP=y only) boot crash c032ef60d1aa9af33730b7a35bbea751b131adc1 "cpumask: convert node_to_cpumask_map[] to cpumask_var_t" didn't get this one conversion. There was a compile warning, but I missed it. Reported-by: Ingo Molnar Signed-off-by: Rusty Russell Cc: Mike Travis LKML-Reference: <200903132342.42813.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ce255e32a593..550df481accd 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -61,7 +61,7 @@ const struct cpumask *cpumask_of_node(int node) dump_stack(); return cpu_online_mask; } - return &node_to_cpumask_map[node]; + return node_to_cpumask_map[node]; } EXPORT_SYMBOL(cpumask_of_node); #endif -- cgit v1.2.3-59-g8ed1b From 082edb7bf443eb8eda15b482d16ad9dd8137ad24 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Mar 2009 23:43:37 +1030 Subject: numa, cpumask: move numa_node_id default implementation to topology.h Impact: cleanup, potential bugfix Not sure what changed to expose this, but clearly that numa_node_id() doesn't belong in mmzone.h (the inline in gfp.h is probably overkill, too). In file included from include/linux/topology.h:34, from arch/x86/mm/numa.c:2: /home/rusty/patches-cpumask/linux-2.6/arch/x86/include/asm/topology.h:64:1: warning: "numa_node_id" redefined In file included from include/linux/topology.h:32, from arch/x86/mm/numa.c:2: include/linux/mmzone.h:770:1: warning: this is the location of the previous definition Signed-off-by: Rusty Russell Cc: Mike Travis LKML-Reference: <200903132343.37661.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- include/linux/gfp.h | 1 + include/linux/mmzone.h | 6 ------ include/linux/topology.h | 5 +++++ 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index dd20cd78faa8..0bbc15f54536 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -4,6 +4,7 @@ #include #include #include +#include struct vm_area_struct; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1aca6cebbb78..e6aacf77986a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -764,12 +764,6 @@ extern int numa_zonelist_order_handler(struct ctl_table *, int, extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ -#include -/* Returns the number of the current Node. */ -#ifndef numa_node_id -#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) -#endif - #ifndef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data contig_page_data; diff --git a/include/linux/topology.h b/include/linux/topology.h index 16b7d6896ce9..7402c1a27c4f 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -196,4 +196,9 @@ int arch_update_cpu_topology(void); #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif +/* Returns the number of the current Node. */ +#ifndef numa_node_id +#define numa_node_id() (cpu_to_node(raw_smp_processor_id())) +#endif + #endif /* _LINUX_TOPOLOGY_H */ -- cgit v1.2.3-59-g8ed1b From f58ba100678f421bdcb000a3c71793f432dfab93 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Mar 2009 15:42:12 +0100 Subject: tracing/syscalls: support for syscalls tracing on x86 Extend x86 architecture syscall tracing support with syscall metadata table details. (The upcoming core syscall tracing modifications rely on this.) Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1236955332-10133-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ftrace.h | 7 +++++ arch/x86/kernel/ftrace.c | 63 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index db24c2278be0..bd2c6511c887 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,6 +28,13 @@ #endif +/* FIXME: I don't want to stay hardcoded */ +#ifdef CONFIG_X86_64 +# define FTRACE_SYSCALL_MAX 296 +#else +# define FTRACE_SYSCALL_MAX 333 +#endif + #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index a85da1764b1c..1d0d7f42efe3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -453,3 +453,66 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#ifdef CONFIG_FTRACE_SYSCALLS + +extern unsigned long __start_syscalls_metadata[]; +extern unsigned long __stop_syscalls_metadata[]; +extern unsigned long *sys_call_table; + +static struct syscall_metadata **syscalls_metadata; + +static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) +{ + struct syscall_metadata *start; + struct syscall_metadata *stop; + char str[KSYM_SYMBOL_LEN]; + + + start = (struct syscall_metadata *)__start_syscalls_metadata; + stop = (struct syscall_metadata *)__stop_syscalls_metadata; + kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); + + for ( ; start < stop; start++) { + if (start->name && !strcmp(start->name, str)) + return start; + } + return NULL; +} + +struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + +void arch_init_ftrace_syscalls(void) +{ + int i; + struct syscall_metadata *meta; + unsigned long **psys_syscall_table = &sys_call_table; + static atomic_t refs; + + if (atomic_inc_return(&refs) != 1) + goto end; + + syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * + FTRACE_SYSCALL_MAX, GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return; + } + + for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + meta = find_syscall_meta(psys_syscall_table[i]); + syscalls_metadata[i] = meta; + } + return; + + /* Paranoid: avoid overflow */ +end: + atomic_dec(&refs); +} +#endif -- cgit v1.2.3-59-g8ed1b From bed1ffca022cc876fb83161d26670e9b5d3cf36b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Mar 2009 15:42:11 +0100 Subject: tracing/syscalls: core infrastructure for syscalls tracing, enhancements Impact: new feature This adds the generic support for syscalls tracing. This is currently exploited through a devoted tracer but other tracing engines can use it. (They just have to play with {start,stop}_ftrace_syscalls() and use the display callbacks unless they want to override them.) The syscalls prototypes definitions are abused here to steal some metadata informations: - syscall name, param types, param names, number of params The syscall addr is not directly saved during this definition because we don't know if its prototype is available in the namespace. But we don't really need it. The arch has just to build a function able to resolve the syscall number to its metadata struct. The current tracer prints the syscall names, parameters names and values (and their types optionally). Currently the value is a raw hex but higher level values diplaying is on my TODO list. Signed-off-by: Frederic Weisbecker LKML-Reference: <1236955332-10133-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 11 ++- include/linux/ftrace.h | 14 +++- include/linux/syscalls.h | 60 +++++++++++++++- kernel/trace/trace.h | 17 +++++ kernel/trace/trace_syscalls.c | 146 +++++++++++++++++++++++++++++++++++--- 5 files changed, 234 insertions(+), 14 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0e0f39be6c8b..d3bc3c86df6a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -77,6 +77,14 @@ #define TRACE_PRINTKS() #endif +#ifdef CONFIG_FTRACE_SYSCALLS +#define TRACE_SYSCALLS() VMLINUX_SYMBOL(__start_syscalls_metadata) = .; \ + *(__syscalls_metadata) \ + VMLINUX_SYMBOL(__stop_syscalls_metadata) = .; +#else +#define TRACE_SYSCALLS() +#endif + /* .data section */ #define DATA_DATA \ *(.data) \ @@ -99,7 +107,8 @@ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ - FTRACE_EVENTS() + FTRACE_EVENTS() \ + TRACE_SYSCALLS() #define RO_DATA(align) \ . = ALIGN((align)); \ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c146c1021a29..6dc1c652447e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -506,13 +506,21 @@ static inline void trace_hw_branch_oops(void) {} /* * A syscall entry in the ftrace syscalls array. * - * @syscall_nr: syscall number + * @name: name of the syscall + * @nb_args: number of parameters it takes + * @types: list of types as strings + * @args: list of args as strings (args[i] matches types[i]) */ -struct syscall_trace_entry { - int syscall_nr; +struct syscall_metadata { + const char *name; + int nb_args; + const char **types; + const char **args; }; #ifdef CONFIG_FTRACE_SYSCALLS +extern void arch_init_ftrace_syscalls(void); +extern struct syscall_metadata *syscall_nr_to_meta(int nr); extern void start_ftrace_syscalls(void); extern void stop_ftrace_syscalls(void); extern void ftrace_syscall_enter(struct pt_regs *regs); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f9f900cfd066..0cff9bb80b02 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -65,6 +65,7 @@ struct old_linux_dirent; #include #include #include +#include #define __SC_DECL1(t1, a1) t1 a1 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) @@ -95,7 +96,46 @@ struct old_linux_dirent; #define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) #define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) +#ifdef CONFIG_FTRACE_SYSCALLS +#define __SC_STR_ADECL1(t, a) #a +#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) +#define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__) +#define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__) +#define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__) +#define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__) + +#define __SC_STR_TDECL1(t, a) #t +#define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__) +#define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__) +#define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__) +#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) +#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) + +#define SYSCALL_METADATA(sname, nb) \ + static const struct syscall_metadata __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("__syscalls_metadata"))) \ + __syscall_meta_##sname = { \ + .name = "sys"#sname, \ + .nb_args = nb, \ + .types = types_##sname, \ + .args = args_##sname, \ + } + +#define SYSCALL_DEFINE0(sname) \ + static const struct syscall_metadata __used \ + __attribute__((__aligned__(4))) \ + __attribute__((section("__syscalls_metadata"))) \ + __syscall_meta_##sname = { \ + .name = "sys_"#sname, \ + .nb_args = 0, \ + }; \ + asmlinkage long sys_##sname(void) + +#else #define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) +#endif + #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) @@ -117,10 +157,26 @@ struct old_linux_dirent; #endif #endif +#ifdef CONFIG_FTRACE_SYSCALLS +#define SYSCALL_DEFINEx(x, sname, ...) \ + static const char *types_##sname[] = { \ + __SC_STR_TDECL##x(__VA_ARGS__) \ + }; \ + static const char *args_##sname[] = { \ + __SC_STR_ADECL##x(__VA_ARGS__) \ + }; \ + SYSCALL_METADATA(sname, x); \ + __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) +#else +#define SYSCALL_DEFINEx(x, sname, ...) \ + __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) +#endif + #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS #define SYSCALL_DEFINE(name) static inline long SYSC_##name -#define SYSCALL_DEFINEx(x, name, ...) \ + +#define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ @@ -134,7 +190,7 @@ struct old_linux_dirent; #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ #define SYSCALL_DEFINE(name) asmlinkage long sys_##name -#define SYSCALL_DEFINEx(x, name, ...) \ +#define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3d49daae47dc..d80ca0d464d9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -194,6 +194,19 @@ struct kmemtrace_free_entry { const void *ptr; }; +struct syscall_trace_enter { + struct trace_entry ent; + int nr; + unsigned long args[]; +}; + +struct syscall_trace_exit { + struct trace_entry ent; + int nr; + unsigned long ret; +}; + + /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: @@ -306,6 +319,10 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ + IF_ASSIGN(var, ent, struct syscall_trace_enter, \ + TRACE_SYSCALL_ENTER); \ + IF_ASSIGN(var, ent, struct syscall_trace_exit, \ + TRACE_SYSCALL_EXIT); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 66cf97449af3..c72e599230ff 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,6 +1,5 @@ -#include #include - +#include #include #include "trace_output.h" @@ -8,6 +7,90 @@ static atomic_t refcount; +/* Our two options */ +enum { + TRACE_SYSCALLS_OPT_TYPES = 0x1, +}; + +static struct tracer_opt syscalls_opts[] = { + { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) }, + { } +}; + +static struct tracer_flags syscalls_flags = { + .val = 0, /* By default: no args types */ + .opts = syscalls_opts +}; + +enum print_line_t +print_syscall_enter(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_enter *trace; + struct syscall_metadata *entry; + int i, ret, syscall; + + trace_assign_type(trace, ent); + + syscall = trace->nr; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + goto end; + + ret = trace_seq_printf(s, "%s(", entry->name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + for (i = 0; i < entry->nb_args; i++) { + /* parameter types */ + if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { + ret = trace_seq_printf(s, "%s ", entry->types[i]); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + /* parameter values */ + ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], + trace->args[i], + i == entry->nb_args - 1 ? ")" : ","); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + +end: + trace_seq_printf(s, "\n"); + return TRACE_TYPE_HANDLED; +} + +enum print_line_t +print_syscall_exit(struct trace_iterator *iter, int flags) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_exit *trace; + int syscall; + struct syscall_metadata *entry; + int ret; + + trace_assign_type(trace, ent); + + syscall = trace->nr; + + entry = syscall_nr_to_meta(syscall); + if (!entry) { + trace_seq_printf(s, "\n"); + return TRACE_TYPE_HANDLED; + } + + ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, + trace->ret); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + void start_ftrace_syscalls(void) { unsigned long flags; @@ -16,6 +99,7 @@ void start_ftrace_syscalls(void) if (atomic_inc_return(&refcount) != 1) goto out; + arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { @@ -48,20 +132,63 @@ out: void ftrace_syscall_enter(struct pt_regs *regs) { + struct syscall_trace_enter *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; + int size; int syscall_nr; + int cpu; syscall_nr = syscall_get_nr(current, regs); - trace_printk("syscall %d enter\n", syscall_nr); + cpu = raw_smp_processor_id(); + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; + + event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, + 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); + + trace_current_buffer_unlock_commit(event, 0, 0); + trace_wake_up(); } void ftrace_syscall_exit(struct pt_regs *regs) { + struct syscall_trace_exit *entry; + struct syscall_metadata *sys_data; + struct ring_buffer_event *event; int syscall_nr; + int cpu; syscall_nr = syscall_get_nr(current, regs); - trace_printk("syscall %d exit\n", syscall_nr); + cpu = raw_smp_processor_id(); + + sys_data = syscall_nr_to_meta(syscall_nr); + if (!sys_data) + return; + + event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, + sizeof(*entry), 0, 0); + if (!event) + return; + + entry = ring_buffer_event_data(event); + entry->nr = syscall_nr; + entry->ret = syscall_get_return_value(current, regs); + + trace_current_buffer_unlock_commit(event, 0, 0); + trace_wake_up(); } static int init_syscall_tracer(struct trace_array *tr) @@ -77,17 +204,20 @@ static void reset_syscall_tracer(struct trace_array *tr) } static struct trace_event syscall_enter_event = { - .type = TRACE_SYSCALL_ENTER, + .type = TRACE_SYSCALL_ENTER, + .trace = print_syscall_enter, }; static struct trace_event syscall_exit_event = { - .type = TRACE_SYSCALL_EXIT, + .type = TRACE_SYSCALL_EXIT, + .trace = print_syscall_exit, }; static struct tracer syscall_tracer __read_mostly = { - .name = "syscall", + .name = "syscall", .init = init_syscall_tracer, - .reset = reset_syscall_tracer + .reset = reset_syscall_tracer, + .flags = &syscalls_flags, }; __init int register_ftrace_syscalls(void) -- cgit v1.2.3-59-g8ed1b From ccd50dfd92ea2c4ba9e39531ac55db53393e783e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 17:02:17 +0100 Subject: tracing/syscalls: support for syscalls tracing on x86, fix Impact: build fix kernel/built-in.o: In function `ftrace_syscall_exit': (.text+0x76667): undefined reference to `syscall_nr_to_meta' ftrace.o is built: obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o But now a CONFIG_FTRACE_SYSCALLS dependency is needed too. Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Lai Jiangshan LKML-Reference: <1236401580-5758-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 339ce35648e6..84000eb931ff 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -66,7 +66,8 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o -- cgit v1.2.3-59-g8ed1b From b478b782e110fdb4135caa3062b6d687e989d994 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 13 Mar 2009 15:10:26 +0800 Subject: kallsyms, tracing: output more proper symbol name Impact: bugfix, output more reliable symbol lookup result Debug tools(dump_stack(), ftrace...) are like to print out symbols. But it is always print out the first aliased symbol.(Aliased symbols are symbols with the same address), and the first aliased symbol is sometime not proper. # echo function_graph > current_tracer # cat trace ...... 1) 1.923 us | select_nohz_load_balancer(); 1) + 76.692 us | } 1) | default_idle() { 1) ==========> | __irqentry_text_start() { 1) 0.000 us | native_apic_mem_write(); 1) | irq_enter() { 1) 0.000 us | idle_cpu(); 1) | tick_check_idle() { 1) 0.000 us | tick_check_oneshot_broadcast(); 1) | tick_nohz_stop_idle() { ...... It's very embarrassing, it ouputs "__irqentry_text_start()", actually, it should output "smp_apic_timer_interrupt()". (these two symbol are the same address, but "__irqentry_text_start" is deemed to the first aliased symbol by scripts/kallsyms) This patch puts symbols like "__irqentry_text_start" to the second aliased symbols. And a more proper symbol name becomes the first. Aliased symbols mostly come from linker script. The solution is guessing "is this symbol defined in linker script", the symbols defined in linker script will not become the first aliased symbol. And if symbols are found to be equal in this "linker script provided" criteria, symbols are sorted by the number of prefix underscores. Signed-off-by: Lai Jiangshan Acked-by: Sam Ravnborg Reviewed-by: Paulo Marques LKML-Reference: <49BA06E2.7080807@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- scripts/kallsyms.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index ad2434b26970..6654cbed965b 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -500,6 +500,51 @@ static void optimize_token_table(void) optimize_result(); } +/* guess for "linker script provide" symbol */ +static int may_be_linker_script_provide_symbol(const struct sym_entry *se) +{ + const char *symbol = (char *)se->sym + 1; + int len = se->len - 1; + + if (len < 8) + return 0; + + if (symbol[0] != '_' || symbol[1] != '_') + return 0; + + /* __start_XXXXX */ + if (!memcmp(symbol + 2, "start_", 6)) + return 1; + + /* __stop_XXXXX */ + if (!memcmp(symbol + 2, "stop_", 5)) + return 1; + + /* __end_XXXXX */ + if (!memcmp(symbol + 2, "end_", 4)) + return 1; + + /* __XXXXX_start */ + if (!memcmp(symbol + len - 6, "_start", 6)) + return 1; + + /* __XXXXX_end */ + if (!memcmp(symbol + len - 4, "_end", 4)) + return 1; + + return 0; +} + +static int prefix_underscores_count(const char *str) +{ + const char *tail = str; + + while (*tail != '_') + tail++; + + return tail - str; +} + static int compare_symbols(const void *a, const void *b) { const struct sym_entry *sa; @@ -521,6 +566,18 @@ static int compare_symbols(const void *a, const void *b) if (wa != wb) return wa - wb; + /* sort by "linker script provide" type */ + wa = may_be_linker_script_provide_symbol(sa); + wb = may_be_linker_script_provide_symbol(sb); + if (wa != wb) + return wa - wb; + + /* sort by the number of prefix underscores */ + wa = prefix_underscores_count((const char *)sa->sym + 1); + wb = prefix_underscores_count((const char *)sb->sym + 1); + if (wa != wb) + return wa - wb; + /* sort by initial order, so that other symbols are left undisturbed */ return sa->start_pos - sb->start_pos; } -- cgit v1.2.3-59-g8ed1b From adf26f84a62b492e002d3b75af671f23ddd3be0a Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 14 Mar 2009 12:08:50 +0100 Subject: fix regression from "vsprintf: unify the format decoding layer for its 3 users" Jeremy Fitzhardinge reported: > Change fef20d9c1380f04ba9492d6463148db07b413708, "vsprintf: > unify the format decoding layer for its 3 users", causes a > regression in xenbus which results in no devices getting > attached to a new domain. %.*s is broken - fix it. Reported-by: Jeremy Fitzhardinge Cc: Frederic Weisbecker Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- lib/vsprintf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index dc1674377009..708e505ce81f 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -843,7 +843,7 @@ precision: spec->precision = 0; } else if (*fmt == '*') { /* it's the next argument */ - spec->type = FORMAT_TYPE_WITDH; + spec->type = FORMAT_TYPE_PRECISION; return ++fmt - start; } } -- cgit v1.2.3-59-g8ed1b From ed681a91ab805341675d166a9592551093c0a2d9 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sat, 14 Mar 2009 12:08:50 +0100 Subject: vsprintf: unify the format decoding layer for its 3 users, cleanup Impact: cleanup Rename FORMAT_TYPE_WITDH to => FORMAT_TYPE_WIDTH Cc: Frederic Weisbecker Cc: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- lib/vsprintf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 708e505ce81f..be3001f912e4 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -398,7 +398,7 @@ static noinline char* put_dec(char *buf, unsigned long long num) enum format_type { FORMAT_TYPE_NONE, /* Just a string part */ - FORMAT_TYPE_WITDH, + FORMAT_TYPE_WIDTH, FORMAT_TYPE_PRECISION, FORMAT_TYPE_CHAR, FORMAT_TYPE_STR, @@ -770,7 +770,7 @@ static int format_decode(const char *fmt, struct printf_spec *spec) const char *start = fmt; /* we finished early by reading the field width */ - if (spec->type == FORMAT_TYPE_WITDH) { + if (spec->type == FORMAT_TYPE_WIDTH) { if (spec->field_width < 0) { spec->field_width = -spec->field_width; spec->flags |= LEFT; @@ -828,7 +828,7 @@ static int format_decode(const char *fmt, struct printf_spec *spec) spec->field_width = skip_atoi(&fmt); else if (*fmt == '*') { /* it's the next argument */ - spec->type = FORMAT_TYPE_WITDH; + spec->type = FORMAT_TYPE_WIDTH; return ++fmt - start; } @@ -1002,7 +1002,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) break; } - case FORMAT_TYPE_WITDH: + case FORMAT_TYPE_WIDTH: spec.field_width = va_arg(args, int); break; @@ -1306,7 +1306,7 @@ do { \ case FORMAT_TYPE_NONE: break; - case FORMAT_TYPE_WITDH: + case FORMAT_TYPE_WIDTH: case FORMAT_TYPE_PRECISION: save_arg(int); break; @@ -1472,7 +1472,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) break; } - case FORMAT_TYPE_WITDH: + case FORMAT_TYPE_WIDTH: spec.field_width = get_arg(int); break; -- cgit v1.2.3-59-g8ed1b From f55aa59106b66cd547c8f296e0b3430ad76554c5 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 23 Feb 2009 12:47:25 +0200 Subject: UBIFS: fix bug where page is marked uptodate when out of space UBIFS fast path in write_begin may mark a page up to date and then discover that there may not be enough space to do the write, and so fall back to a slow path. The slow path tries harder, but may still find no space - leaving the page marked up to date, when it is not. This patch ensures that the page is marked not up to date in that case. The bug that this patch fixes becomes evident when the write is into a hole (sparse file) or is at the end of the file and a subsequent read is off the end of the file. In both cases, the file system should return zeros but was instead returning the page that had not been written because the file system was out of space. Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy --- fs/ubifs/file.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 93b6de51f261..4e7f0aca9ebc 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, struct ubifs_inode *ui = ubifs_inode(inode); pgoff_t index = pos >> PAGE_CACHE_SHIFT; int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int skipped_read = 0; struct page *page; ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); @@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, if (!PageUptodate(page)) { /* The page is not loaded from the flash */ - if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { /* * We change whole page so no need to load it. But we * have to set the @PG_checked flag to make the further @@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * the media. */ SetPageChecked(page); - else { + skipped_read = 1; + } else { err = do_readpage(page); if (err) { unlock_page(page); @@ -469,6 +471,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, err = allocate_budget(c, page, ui, appending); if (unlikely(err)) { ubifs_assert(err == -ENOSPC); + /* + * If we skipped reading the page because we were going to + * write all of it, then it is not up to date. + */ + if (skipped_read) { + ClearPageChecked(page); + ClearPageUptodate(page); + } /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the -- cgit v1.2.3-59-g8ed1b From da5309cd28ffda6ed8a4147bd14f1e4fbbd6503d Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Thu, 12 Mar 2009 09:33:37 -0500 Subject: Fix xfs debug build breakage by pushing xfs_error.h after xfs_mount.h, which it depends on. Reported-by: Stephen Rothwell Reviewed-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Felix Blyakher --- fs/xfs/support/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 930bb34804b0..3f3610a7ee05 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -17,7 +17,6 @@ */ #include #include "debug.h" -#include "xfs_error.h" /* xfs_mount.h drags a lot of crap in, sorry.. */ #include "xfs_sb.h" @@ -25,6 +24,7 @@ #include "xfs_ag.h" #include "xfs_dmapi.h" #include "xfs_mount.h" +#include "xfs_error.h" static char message[1024]; /* keep it off the stack */ static DEFINE_SPINLOCK(xfs_err_lock); -- cgit v1.2.3-59-g8ed1b From b221337ae4ef9baff84d6d5ecb806e79a5597329 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 15 Mar 2009 17:20:22 +0200 Subject: UBIFS: fix bogus assertion Empty journal head LEBs are accounted as taken empty as well, so the GC LEB does not have to be the only taken empty LEB when nounting/remounting. Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1182b66a5491..03cd9ac4dcb2 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1318,11 +1318,15 @@ static int mount_ubifs(struct ubifs_info *c) else { c->need_recovery = 0; ubifs_msg("recovery completed"); - /* GC LEB has to be empty and taken at this point */ - ubifs_assert(c->lst.taken_empty_lebs == 1); + /* + * GC LEB has to be empty and taken at this point. But + * the journal head LEBs may also be accounted as + * "empty taken" if they are empty. + */ + ubifs_assert(c->lst.taken_empty_lebs > 0); } } else - ubifs_assert(c->lst.taken_empty_lebs == 1); + ubifs_assert(c->lst.taken_empty_lebs > 0); err = dbg_check_filesystem(c); if (err) @@ -1775,7 +1779,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) c->bu.buf = NULL; } - ubifs_assert(c->lst.taken_empty_lebs == 1); + ubifs_assert(c->lst.taken_empty_lebs > 0); return 0; } -- cgit v1.2.3-59-g8ed1b From 3e7be3fb40296aab48a91ec599f22d2c5e8a4351 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:11:46 +1030 Subject: cpumask: prepare for iterators to only go to nr_cpu_ids/nr_cpumask_bits.: cris Impact: cleanup, futureproof In fact, all cpumask ops will only be valid (in general) for bit numbers < nr_cpu_ids. So use that instead of NR_CPUS in various places. This is always safe: no cpu number can be >= nr_cpu_ids, and nr_cpu_ids is initialized to NR_CPUS at boot. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar --- arch/cris/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/cris/kernel/setup.c b/arch/cris/kernel/setup.c index 04d48dd91ddf..b712f4934c4b 100644 --- a/arch/cris/kernel/setup.c +++ b/arch/cris/kernel/setup.c @@ -166,7 +166,7 @@ void __init setup_arch(char **cmdline_p) static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < NR_CPUS ? (void *)(int)(*pos + 1): NULL; + return *pos < nr_cpu_ids ? (void *)(int)(*pos + 1) : NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) -- cgit v1.2.3-59-g8ed1b From afc6ca01f6da831ff09f4dabdef3b50f114e9e1e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:11:47 +1030 Subject: cpumask: Use accessors code.: cris Impact: use new API Use the accessors rather than frobbing bits directly. Most of this is in arch code I haven't even compiled, but is straightforward. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- arch/cris/arch-v32/kernel/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 9dac17334640..b47764c3308f 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -98,9 +98,9 @@ void __devinit smp_prepare_boot_cpu(void) SUPP_BANK_SEL(2); SUPP_REG_WR(RW_MM_TLB_PGD, pgd); - cpu_set(0, cpu_online_map); + set_cpu_online(0, true); cpu_set(0, phys_cpu_present_map); - cpu_set(0, cpu_possible_map); + set_cpu_possible(0, true); } void __init smp_cpus_done(unsigned int max_cpus) -- cgit v1.2.3-59-g8ed1b From b9d65c04773850ff3a82f69d43981be650e658a1 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:11:47 +1030 Subject: cpumask: use mm_cpumask() wrapper: cris Makes code futureproof against the impending change to mm->cpu_vm_mask. It's also a chance to use the new cpumask_ ops which take a pointer (the older ones are deprecated, but there's no hurry for arch code). Signed-off-by: Rusty Russell --- arch/cris/arch-v32/kernel/smp.c | 6 +++--- arch/cris/arch-v32/mm/tlb.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index b47764c3308f..dc31b04d0827 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -232,7 +232,7 @@ void flush_tlb_common(struct mm_struct* mm, struct vm_area_struct* vma, unsigned cpumask_t cpu_mask; spin_lock_irqsave(&tlbstate_lock, flags); - cpu_mask = (mm == FLUSH_ALL ? CPU_MASK_ALL : mm->cpu_vm_mask); + cpu_mask = (mm == FLUSH_ALL ? cpu_all_mask : *mm_cpumask(mm)); cpu_clear(smp_processor_id(), cpu_mask); flush_mm = mm; flush_vma = vma; @@ -252,8 +252,8 @@ void flush_tlb_mm(struct mm_struct *mm) __flush_tlb_mm(mm); flush_tlb_common(mm, FLUSH_ALL, 0); /* No more mappings in other CPUs */ - cpus_clear(mm->cpu_vm_mask); - cpu_set(smp_processor_id(), mm->cpu_vm_mask); + cpumask_clear(mm_cpumask(mm)); + cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); } void flush_tlb_page(struct vm_area_struct *vma, diff --git a/arch/cris/arch-v32/mm/tlb.c b/arch/cris/arch-v32/mm/tlb.c index 55ade36fe8a8..6779bcb28ab0 100644 --- a/arch/cris/arch-v32/mm/tlb.c +++ b/arch/cris/arch-v32/mm/tlb.c @@ -185,7 +185,7 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Make sure there is a MMU context. */ spin_lock(&mmu_context_lock); get_mmu_context(next); - cpu_set(cpu, next->cpu_vm_mask); + cpumask_set_cpu(cpu, mm_cpumask(next)); spin_unlock(&mmu_context_lock); /* -- cgit v1.2.3-59-g8ed1b From 4c395bdd3f2ca8f7e8efad881e16071182c3b8ca Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Wed, 4 Mar 2009 11:55:28 -0800 Subject: hp-wmi: notify of a potential docking state change on resume It is possible that the system gets docked or undocked while it's suspended. Generate an input event on resume to notify user space if there was a state change. As it is a switch, we can generate the event unconditionally; the input layer will only pass it on if there is an actual change. Signed-off-by: Frans Pop Cc: Matthew Garrett Signed-off-by: Andrew Morton Signed-off-by: Len Brown --- drivers/platform/x86/hp-wmi.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index f41135f2fb29..50d9019de2be 100644 --- a/drivers/platform/x86/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c @@ -53,6 +53,7 @@ MODULE_ALIAS("wmi:5FB7F034-2C63-45e9-BE91-3D44E2C707E4"); static int __init hp_wmi_bios_setup(struct platform_device *device); static int __exit hp_wmi_bios_remove(struct platform_device *device); +static int hp_wmi_resume_handler(struct platform_device *device); struct bios_args { u32 signature; @@ -101,6 +102,7 @@ static struct platform_driver hp_wmi_driver = { }, .probe = hp_wmi_bios_setup, .remove = hp_wmi_bios_remove, + .resume = hp_wmi_resume_handler, }; static int hp_wmi_perform_query(int query, int write, int value) @@ -487,6 +489,29 @@ static int __exit hp_wmi_bios_remove(struct platform_device *device) return 0; } +static int hp_wmi_resume_handler(struct platform_device *device) +{ + struct key_entry *key; + + /* + * Docking state may have changed while suspended, so trigger + * an input event for the current state. As this is a switch, + * the input layer will only actually pass it on if the state + * changed. + */ + for (key = hp_wmi_keymap; key->type != KE_END; key++) { + switch (key->type) { + case KE_SW: + input_report_switch(hp_wmi_input_dev, key->keycode, + hp_wmi_dock_state()); + input_sync(hp_wmi_input_dev); + break; + } + } + + return 0; +} + static int __init hp_wmi_init(void) { int err; -- cgit v1.2.3-59-g8ed1b From 91887a362984324e254473e92820758c8e658f78 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:19:37 +1030 Subject: cpumask: arch_send_call_function_ipi_mask: parisc We're weaning the core code off handing cpumask's around on-stack. This introduces arch_send_call_function_ipi_mask(), and by defining it, the old arch_send_call_function_ipi is defined by the core code. We also take the chance to change send_IPI_mask() and use the new for_each_cpu() iterator. Signed-off-by: Rusty Russell --- arch/parisc/include/asm/smp.h | 3 ++- arch/parisc/kernel/smp.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/parisc/include/asm/smp.h b/arch/parisc/include/asm/smp.h index 6ef4b7867b1b..21eb45a52629 100644 --- a/arch/parisc/include/asm/smp.h +++ b/arch/parisc/include/asm/smp.h @@ -29,7 +29,8 @@ extern void smp_send_reschedule(int cpu); extern void smp_send_all_nop(void); extern void arch_send_call_function_single_ipi(int cpu); -extern void arch_send_call_function_ipi(cpumask_t mask); +extern void arch_send_call_function_ipi_mask(const struct cpumask *mask); +#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask #endif /* !ASSEMBLY */ diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 9995d7ed5819..ba9b6808e2d9 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -214,11 +214,11 @@ ipi_send(int cpu, enum ipi_message_type op) } static void -send_IPI_mask(cpumask_t mask, enum ipi_message_type op) +send_IPI_mask(const struct cpumask *mask, enum ipi_message_type op) { int cpu; - for_each_cpu_mask(cpu, mask) + for_each_cpu(cpu, mask) ipi_send(cpu, op); } @@ -257,7 +257,7 @@ smp_send_all_nop(void) send_IPI_allbutself(IPI_NOP); } -void arch_send_call_function_ipi(cpumask_t mask) +void arch_send_call_function_ipi_mask(const struct cpumask *mask) { send_IPI_mask(mask, IPI_CALL_FUNC); } -- cgit v1.2.3-59-g8ed1b From bd071e1a371d31db243edc4714ff9e8d1ea1309e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:19:37 +1030 Subject: cpumask: prepare for iterators to only go to nr_cpu_ids/nr_cpumask_bits.: parisc Impact: cleanup, futureproof In fact, all cpumask ops will only be valid (in general) for bit numbers < nr_cpu_ids. So use that instead of NR_CPUS in various places. This is always safe: no cpu number can be >= nr_cpu_ids, and nr_cpu_ids is initialized to NR_CPUS at boot. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis Acked-by: Ingo Molnar --- arch/parisc/kernel/irq.c | 4 ++-- arch/parisc/kernel/processor.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 29e70e16ede8..103752a8fb94 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -311,12 +311,12 @@ unsigned long txn_alloc_addr(unsigned int virt_irq) next_cpu++; /* assign to "next" CPU we want this bugger on */ /* validate entry */ - while ((next_cpu < NR_CPUS) && + while ((next_cpu < nr_cpu_ids) && (!per_cpu(cpu_data, next_cpu).txn_addr || !cpu_online(next_cpu))) next_cpu++; - if (next_cpu >= NR_CPUS) + if (next_cpu >= nr_cpu_ids) next_cpu = 0; /* nothing else, assign monarch */ return txn_affinity_addr(virt_irq, next_cpu); diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index ecb609342feb..44a1a509d894 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -100,8 +100,8 @@ static int __cpuinit processor_probe(struct parisc_device *dev) struct cpuinfo_parisc *p; #ifdef CONFIG_SMP - if (num_online_cpus() >= NR_CPUS) { - printk(KERN_INFO "num_online_cpus() >= NR_CPUS\n"); + if (num_online_cpus() >= nr_cpu_ids) { + printk(KERN_INFO "num_online_cpus() >= nr_cpu_ids\n"); return 1; } #else -- cgit v1.2.3-59-g8ed1b From 9bc181d8d7cb6462de0c315e364780ad275f7c57 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 16 Mar 2009 14:19:38 +1030 Subject: cpumask: Use accessors code.: parisc Impact: use new API Use the accessors rather than frobbing bits directly. Most of this is in arch code I haven't even compiled, but it is mostly straightforward. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- arch/parisc/kernel/processor.c | 2 +- arch/parisc/kernel/smp.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 44a1a509d894..bdbabfb87347 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -214,7 +214,7 @@ static int __cpuinit processor_probe(struct parisc_device *dev) */ #ifdef CONFIG_SMP if (cpuid) { - cpu_set(cpuid, cpu_present_map); + set_cpu_present(cpuid, true); cpu_up(cpuid); } #endif diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index ba9b6808e2d9..869197992f97 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -113,7 +113,7 @@ halt_processor(void) { /* REVISIT : redirect I/O Interrupts to another CPU? */ /* REVISIT : does PM *know* this CPU isn't available? */ - cpu_clear(smp_processor_id(), cpu_online_map); + set_cpu_online(smp_processor_id(), false); local_irq_disable(); for (;;) ; @@ -296,13 +296,14 @@ smp_cpu_init(int cpunum) mb(); /* Well, support 2.4 linux scheme as well. */ - if (cpu_test_and_set(cpunum, cpu_online_map)) + if (cpu_isset(cpunum, cpu_online_map)) { extern void machine_halt(void); /* arch/parisc.../process.c */ printk(KERN_CRIT "CPU#%d already initialized!\n", cpunum); machine_halt(); } + set_cpu_online(cpunum, true); /* Initialise the idle task for this CPU */ atomic_inc(&init_mm.mm_count); @@ -424,8 +425,8 @@ void __init smp_prepare_boot_cpu(void) /* Setup BSP mappings */ printk(KERN_INFO "SMP: bootstrap CPU ID is %d\n", bootstrap_processor); - cpu_set(bootstrap_processor, cpu_online_map); - cpu_set(bootstrap_processor, cpu_present_map); + set_cpu_online(bootstrap_processor, true); + set_cpu_present(bootstrap_processor, true); } @@ -436,8 +437,7 @@ void __init smp_prepare_boot_cpu(void) */ void __init smp_prepare_cpus(unsigned int max_cpus) { - cpus_clear(cpu_present_map); - cpu_set(0, cpu_present_map); + init_cpu_present(cpumask_of(0)); parisc_max_cpus = max_cpus; if (!max_cpus) -- cgit v1.2.3-59-g8ed1b From 21b699c89545ed94d9a1c6fcc8ac704784f68f40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2009 08:19:29 +0100 Subject: xfs: cleanup log unmount handling Kill the current xfs_log_unmount wrapper and opencode the two function calls in the only caller. Rename the current xfs_log_unmount_dealloc to xfs_log_unmount as it undoes xfs_log_mount and the new name makes that more clear. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/xfs_log.c | 15 +-------------- fs/xfs/xfs_log.h | 3 +-- fs/xfs/xfs_mount.c | 5 +++-- 3 files changed, 5 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c8f300897728..25faa3fe83d2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -634,19 +634,6 @@ xfs_log_mount_finish(xfs_mount_t *mp) return error; } -/* - * Unmount processing for the log. - */ -int -xfs_log_unmount(xfs_mount_t *mp) -{ - int error; - - error = xfs_log_unmount_write(mp); - xfs_log_unmount_dealloc(mp); - return error; -} - /* * Final log writes as part of unmount. * @@ -797,7 +784,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) * and deallocate the log as the aild references the log. */ void -xfs_log_unmount_dealloc(xfs_mount_t *mp) +xfs_log_unmount(xfs_mount_t *mp) { xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 8a3e84e900a3..d0c9baa50b1a 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -170,9 +170,8 @@ int xfs_log_write(struct xfs_mount *mp, int nentries, xfs_log_ticket_t ticket, xfs_lsn_t *start_lsn); -int xfs_log_unmount(struct xfs_mount *mp); int xfs_log_unmount_write(struct xfs_mount *mp); -void xfs_log_unmount_dealloc(struct xfs_mount *mp); +void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); int xfs_log_need_covered(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 664961e45e02..2549a235dad2 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1194,7 +1194,7 @@ xfs_mountfs( out_rele_rip: IRELE(rip); out_log_dealloc: - xfs_log_unmount_dealloc(mp); + xfs_log_unmount(mp); out_free_perag: xfs_free_perag(mp); out_remove_uuid: @@ -1280,7 +1280,8 @@ xfs_unmountfs( "Freespace may not be correct on next mount."); xfs_unmountfs_writesb(mp); xfs_unmountfs_wait(mp); /* wait for async bufs */ - xfs_log_unmount(mp); /* Done! No more fs ops. */ + xfs_log_unmount_write(mp); + xfs_log_unmount(mp); if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) uuid_table_remove(&mp->m_sb.sb_uuid); -- cgit v1.2.3-59-g8ed1b From dd0bbad81c8d02315a5035d3d6ea441dd1254dc1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2009 08:19:59 +0100 Subject: xfs: remove another leftover of the old inode log item format There's another little snipplet of code left from the handling of the old inode log item format in xlog_recover_do_inode_trans. Kill it as it can't be reached anymore. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ceeba45e0224..ac47c5f2bcfa 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2511,16 +2511,10 @@ xlog_recover_do_inode_trans( } write_inode_buffer: - if (ITEM_TYPE(item) == XFS_LI_INODE) { - ASSERT(bp->b_mount == NULL || bp->b_mount == mp); - bp->b_mount = mp; - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); - } else { - XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); - } - + ASSERT(bp->b_mount == NULL || bp->b_mount == mp); + bp->b_mount = mp; + XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); + xfs_bdwrite(mp, bp); error: if (need_free) kmem_free(in_f); -- cgit v1.2.3-59-g8ed1b From ff0205e032b9733bb634ad5dadc79a0f6d30c721 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2009 08:20:52 +0100 Subject: xfs: cleanup xlog_recover_do_trans Change the big if-elsif-else block handling the different item types into a more natural switch, remove assignments in conditionals and remove an out of place comment from centuries ago on IRIX. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 65 +++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ac47c5f2bcfa..73584444c194 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2763,51 +2763,48 @@ xlog_recover_do_trans( int error = 0; xlog_recover_item_t *item, *first_item; - if ((error = xlog_recover_reorder_trans(trans))) + error = xlog_recover_reorder_trans(trans); + if (error) return error; + first_item = item = trans->r_itemq; do { - /* - * we don't need to worry about the block number being - * truncated in > 1 TB buffers because in user-land, - * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so - * the blknos will get through the user-mode buffer - * cache properly. The only bad case is o32 kernels - * where xfs_daddr_t is 32-bits but mount will warn us - * off a > 1 TB filesystem before we get here. - */ - if ((ITEM_TYPE(item) == XFS_LI_BUF)) { - if ((error = xlog_recover_do_buffer_trans(log, item, - pass))) - break; - } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) { - if ((error = xlog_recover_do_inode_trans(log, item, - pass))) - break; - } else if (ITEM_TYPE(item) == XFS_LI_EFI) { - if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn, - pass))) - break; - } else if (ITEM_TYPE(item) == XFS_LI_EFD) { + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + error = xlog_recover_do_buffer_trans(log, item, pass); + break; + case XFS_LI_INODE: + error = xlog_recover_do_inode_trans(log, item, pass); + break; + case XFS_LI_EFI: + error = xlog_recover_do_efi_trans(log, item, + trans->r_lsn, pass); + break; + case XFS_LI_EFD: xlog_recover_do_efd_trans(log, item, pass); - } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { - if ((error = xlog_recover_do_dquot_trans(log, item, - pass))) - break; - } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { - if ((error = xlog_recover_do_quotaoff_trans(log, item, - pass))) - break; - } else { - xlog_warn("XFS: xlog_recover_do_trans"); + error = 0; + break; + case XFS_LI_DQUOT: + error = xlog_recover_do_dquot_trans(log, item, pass); + break; + case XFS_LI_QUOTAOFF: + error = xlog_recover_do_quotaoff_trans(log, item, + pass); + break; + default: + xlog_warn( + "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item)); ASSERT(0); error = XFS_ERROR(EIO); break; } + + if (error) + return error; item = item->ri_next; } while (first_item != item); - return error; + return 0; } /* -- cgit v1.2.3-59-g8ed1b From 076e6acb8f0d9532ee6c50512c1927c0a8e34f2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2009 08:24:13 +0100 Subject: xfs: cleanup xlog_bread Most callers of xlog_bread need to call xlog_align to get the actual offset. Consolidate that call into the main xlog_bread and provide a _xlog_bread for those few that don't want the actual offset. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/xfs_log_priv.h | 1 - fs/xfs/xfs_log_recover.c | 225 +++++++++++++++++++++++++++++------------------ 2 files changed, 138 insertions(+), 88 deletions(-) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 3670d48fd774..bcad5f4c1fd1 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -455,7 +455,6 @@ extern void xlog_recover_process_iunlinks(xlog_t *log); extern struct xfs_buf *xlog_get_bp(xlog_t *, int); extern void xlog_put_bp(struct xfs_buf *); -extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); extern kmem_zone_t *xfs_log_ticket_zone; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 73584444c194..7ba450116d4f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -94,12 +94,30 @@ xlog_put_bp( xfs_buf_free(bp); } +STATIC xfs_caddr_t +xlog_align( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp) +{ + xfs_caddr_t ptr; + + if (!log->l_sectbb_log) + return XFS_BUF_PTR(bp); + + ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); + ASSERT(XFS_BUF_SIZE(bp) >= + BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); + return ptr; +} + /* * nbblks should be uint, but oh well. Just want to catch that 32-bit length. */ -int -xlog_bread( +STATIC int +xlog_bread_noalign( xlog_t *log, xfs_daddr_t blk_no, int nbblks, @@ -137,6 +155,24 @@ xlog_bread( return error; } +STATIC int +xlog_bread( + xlog_t *log, + xfs_daddr_t blk_no, + int nbblks, + xfs_buf_t *bp, + xfs_caddr_t *offset) +{ + int error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + if (error) + return error; + + *offset = xlog_align(log, blk_no, nbblks, bp); + return 0; +} + /* * Write out the buffer at the given block for the given number of blocks. * The buffer is kept locked across the write and is returned locked. @@ -180,24 +216,6 @@ xlog_bwrite( return error; } -STATIC xfs_caddr_t -xlog_align( - xlog_t *log, - xfs_daddr_t blk_no, - int nbblks, - xfs_buf_t *bp) -{ - xfs_caddr_t ptr; - - if (!log->l_sectbb_log) - return XFS_BUF_PTR(bp); - - ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); - ASSERT(XFS_BUF_SIZE(bp) >= - BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); - return ptr; -} - #ifdef DEBUG /* * dump debug superblock and log record information @@ -321,9 +339,9 @@ xlog_find_cycle_start( mid_blk = BLK_AVG(first_blk, *last_blk); while (mid_blk != first_blk && mid_blk != *last_blk) { - if ((error = xlog_bread(log, mid_blk, 1, bp))) + error = xlog_bread(log, mid_blk, 1, bp, &offset); + if (error) return error; - offset = xlog_align(log, mid_blk, 1, bp); mid_cycle = xlog_get_cycle(offset); if (mid_cycle == cycle) { *last_blk = mid_blk; @@ -379,10 +397,10 @@ xlog_find_verify_cycle( bcount = min(bufblks, (start_blk + nbblks - i)); - if ((error = xlog_bread(log, i, bcount, bp))) + error = xlog_bread(log, i, bcount, bp, &buf); + if (error) goto out; - buf = xlog_align(log, i, bcount, bp); for (j = 0; j < bcount; j++) { cycle = xlog_get_cycle(buf); if (cycle == stop_on_cycle_no) { @@ -436,9 +454,9 @@ xlog_find_verify_log_record( return ENOMEM; smallmem = 1; } else { - if ((error = xlog_bread(log, start_blk, num_blks, bp))) + error = xlog_bread(log, start_blk, num_blks, bp, &offset); + if (error) goto out; - offset = xlog_align(log, start_blk, num_blks, bp); offset += ((num_blks - 1) << BBSHIFT); } @@ -453,9 +471,9 @@ xlog_find_verify_log_record( } if (smallmem) { - if ((error = xlog_bread(log, i, 1, bp))) + error = xlog_bread(log, i, 1, bp, &offset); + if (error) goto out; - offset = xlog_align(log, i, 1, bp); } head = (xlog_rec_header_t *)offset; @@ -559,15 +577,18 @@ xlog_find_head( bp = xlog_get_bp(log, 1); if (!bp) return ENOMEM; - if ((error = xlog_bread(log, 0, 1, bp))) + + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, 0, 1, bp); + first_half_cycle = xlog_get_cycle(offset); last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ - if ((error = xlog_bread(log, last_blk, 1, bp))) + error = xlog_bread(log, last_blk, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, last_blk, 1, bp); + last_half_cycle = xlog_get_cycle(offset); ASSERT(last_half_cycle != 0); @@ -817,9 +838,10 @@ xlog_find_tail( if (!bp) return ENOMEM; if (*head_blk == 0) { /* special case */ - if ((error = xlog_bread(log, 0, 1, bp))) + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) goto bread_err; - offset = xlog_align(log, 0, 1, bp); + if (xlog_get_cycle(offset) == 0) { *tail_blk = 0; /* leave all other log inited values alone */ @@ -832,9 +854,10 @@ xlog_find_tail( */ ASSERT(*head_blk < INT_MAX); for (i = (int)(*head_blk) - 1; i >= 0; i--) { - if ((error = xlog_bread(log, i, 1, bp))) + error = xlog_bread(log, i, 1, bp, &offset); + if (error) goto bread_err; - offset = xlog_align(log, i, 1, bp); + if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { found = 1; break; @@ -848,9 +871,10 @@ xlog_find_tail( */ if (!found) { for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { - if ((error = xlog_bread(log, i, 1, bp))) + error = xlog_bread(log, i, 1, bp, &offset); + if (error) goto bread_err; - offset = xlog_align(log, i, 1, bp); + if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { found = 2; @@ -922,10 +946,10 @@ xlog_find_tail( if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = (i + hblks) % log->l_logBBsize; - if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { + error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + if (error) goto bread_err; - } - offset = xlog_align(log, umount_data_blk, 1, bp); + op_head = (xlog_op_header_t *)offset; if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { /* @@ -1017,9 +1041,10 @@ xlog_find_zeroed( bp = xlog_get_bp(log, 1); if (!bp) return ENOMEM; - if ((error = xlog_bread(log, 0, 1, bp))) + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, 0, 1, bp); + first_cycle = xlog_get_cycle(offset); if (first_cycle == 0) { /* completely zeroed log */ *blk_no = 0; @@ -1028,9 +1053,10 @@ xlog_find_zeroed( } /* check partially zeroed log */ - if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) + error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, log_bbnum-1, 1, bp); + last_cycle = xlog_get_cycle(offset); if (last_cycle != 0) { /* log completely written to */ xlog_put_bp(bp); @@ -1152,10 +1178,10 @@ xlog_write_log_records( */ balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); if (balign != start_block) { - if ((error = xlog_bread(log, start_block, 1, bp))) { - xlog_put_bp(bp); - return error; - } + error = xlog_bread_noalign(log, start_block, 1, bp); + if (error) + goto out_put_bp; + j = start_block - balign; } @@ -1175,10 +1201,14 @@ xlog_write_log_records( balign = BBTOB(ealign - start_block); error = XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb)); - if (!error) - error = xlog_bread(log, ealign, sectbb, bp); - if (!error) - error = XFS_BUF_SET_PTR(bp, offset, bufblks); + if (error) + break; + + error = xlog_bread_noalign(log, ealign, sectbb, bp); + if (error) + break; + + error = XFS_BUF_SET_PTR(bp, offset, bufblks); if (error) break; } @@ -1195,6 +1225,8 @@ xlog_write_log_records( start_block += endcount; j = 0; } + + out_put_bp: xlog_put_bp(bp); return error; } @@ -3481,9 +3513,11 @@ xlog_do_recovery_pass( hbp = xlog_get_bp(log, 1); if (!hbp) return ENOMEM; - if ((error = xlog_bread(log, tail_blk, 1, hbp))) + + error = xlog_bread(log, tail_blk, 1, hbp, &offset); + if (error) goto bread_err1; - offset = xlog_align(log, tail_blk, 1, hbp); + rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, tail_blk); if (error) @@ -3517,9 +3551,10 @@ xlog_do_recovery_pass( memset(rhash, 0, sizeof(rhash)); if (tail_blk <= head_blk) { for (blk_no = tail_blk; blk_no < head_blk; ) { - if ((error = xlog_bread(log, blk_no, hblks, hbp))) + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, hblks, hbp); + rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, blk_no); if (error) @@ -3527,10 +3562,11 @@ xlog_do_recovery_pass( /* blocks in data section */ bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); - error = xlog_bread(log, blk_no + hblks, bblks, dbp); + error = xlog_bread(log, blk_no + hblks, bblks, dbp, + &offset); if (error) goto bread_err2; - offset = xlog_align(log, blk_no + hblks, bblks, dbp); + xlog_unpack_data(rhead, offset, log); if ((error = xlog_recover_process_data(log, rhash, rhead, offset, pass))) @@ -3553,10 +3589,10 @@ xlog_do_recovery_pass( wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { /* Read header in one read */ - error = xlog_bread(log, blk_no, hblks, hbp); + error = xlog_bread(log, blk_no, hblks, hbp, + &offset); if (error) goto bread_err2; - offset = xlog_align(log, blk_no, hblks, hbp); } else { /* This LR is split across physical log end */ if (blk_no != log->l_logBBsize) { @@ -3564,12 +3600,13 @@ xlog_do_recovery_pass( ASSERT(blk_no <= INT_MAX); split_hblks = log->l_logBBsize - (int)blk_no; ASSERT(split_hblks > 0); - if ((error = xlog_bread(log, blk_no, - split_hblks, hbp))) + error = xlog_bread(log, blk_no, + split_hblks, hbp, + &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, - split_hblks, hbp); } + /* * Note: this black magic still works with * large sector sizes (non-512) only because: @@ -3587,14 +3624,19 @@ xlog_do_recovery_pass( error = XFS_BUF_SET_PTR(hbp, bufaddr + BBTOB(split_hblks), BBTOB(hblks - split_hblks)); - if (!error) - error = xlog_bread(log, 0, - wrapped_hblks, hbp); - if (!error) - error = XFS_BUF_SET_PTR(hbp, bufaddr, + if (error) + goto bread_err2; + + error = xlog_bread_noalign(log, 0, + wrapped_hblks, hbp); + if (error) + goto bread_err2; + + error = XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks)); if (error) goto bread_err2; + if (!offset) offset = xlog_align(log, 0, wrapped_hblks, hbp); @@ -3610,10 +3652,10 @@ xlog_do_recovery_pass( /* Read in data for log record */ if (blk_no + bblks <= log->l_logBBsize) { - error = xlog_bread(log, blk_no, bblks, dbp); + error = xlog_bread(log, blk_no, bblks, dbp, + &offset); if (error) goto bread_err2; - offset = xlog_align(log, blk_no, bblks, dbp); } else { /* This log record is split across the * physical end of log */ @@ -3627,12 +3669,13 @@ xlog_do_recovery_pass( split_bblks = log->l_logBBsize - (int)blk_no; ASSERT(split_bblks > 0); - if ((error = xlog_bread(log, blk_no, - split_bblks, dbp))) + error = xlog_bread(log, blk_no, + split_bblks, dbp, + &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, - split_bblks, dbp); } + /* * Note: this black magic still works with * large sector sizes (non-512) only because: @@ -3649,15 +3692,19 @@ xlog_do_recovery_pass( error = XFS_BUF_SET_PTR(dbp, bufaddr + BBTOB(split_bblks), BBTOB(bblks - split_bblks)); - if (!error) - error = xlog_bread(log, wrapped_hblks, - bblks - split_bblks, - dbp); - if (!error) - error = XFS_BUF_SET_PTR(dbp, bufaddr, - h_size); if (error) goto bread_err2; + + error = xlog_bread_noalign(log, wrapped_hblks, + bblks - split_bblks, + dbp); + if (error) + goto bread_err2; + + error = XFS_BUF_SET_PTR(dbp, bufaddr, h_size); + if (error) + goto bread_err2; + if (!offset) offset = xlog_align(log, wrapped_hblks, bblks - split_bblks, dbp); @@ -3674,17 +3721,21 @@ xlog_do_recovery_pass( /* read first part of physical log */ while (blk_no < head_blk) { - if ((error = xlog_bread(log, blk_no, hblks, hbp))) + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, hblks, hbp); + rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, blk_no); if (error) goto bread_err2; + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); - if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) + error = xlog_bread(log, blk_no+hblks, bblks, dbp, + &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no+hblks, bblks, dbp); + xlog_unpack_data(rhead, offset, log); if ((error = xlog_recover_process_data(log, rhash, rhead, offset, pass))) -- cgit v1.2.3-59-g8ed1b From 8fab451e3cfe02a5e3dfc4bab3cfb975bc11fc09 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2009 08:24:46 +0100 Subject: xfs: kill vn_atime_* helpers. Two out of three are unused already, and the third is better done open-coded with a comment describing what's going on here. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/linux-2.6/xfs_vnode.h | 19 ------------------- fs/xfs/xfs_itable.c | 7 ++++++- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index ea4675c48209..de9dc747b451 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -57,25 +57,6 @@ static inline int VN_BAD(struct inode *vp) return is_bad_inode(vp); } -/* - * Extracting atime values in various formats - */ -static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime) -{ - bs_atime->tv_sec = vp->i_atime.tv_sec; - bs_atime->tv_nsec = vp->i_atime.tv_nsec; -} - -static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts) -{ - *ts = vp->i_atime; -} - -static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt) -{ - *tt = vp->i_atime.tv_sec; -} - /* * Some useful predicates. */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index cf98a805ec90..24f1d139a636 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -83,7 +83,12 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime); + /* + * We are reading the atime from the Linux inode because the + * dinode might not be uptodate. + */ + buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec; + buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec; buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; -- cgit v1.2.3-59-g8ed1b From cb4c8cc1e92bc68c952e9a81a9fb9736bd8150de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2009 08:25:25 +0100 Subject: xfs: kill VN_BAD Remove this rather pointless wrapper and use is_bad_inode directly. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner --- fs/xfs/linux-2.6/xfs_vnode.h | 8 -------- fs/xfs/xfs_vnodeops.c | 4 ++-- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index de9dc747b451..ad7fbead4c97 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -49,14 +49,6 @@ struct attrlist_cursor_kern; Prevent VM access to the pages until the operation completes. */ -/* - * Dealing with bad inodes - */ -static inline int VN_BAD(struct inode *vp) -{ - return is_bad_inode(vp); -} - /* * Some useful predicates. */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 59de04954bc8..6e28461e7611 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1136,7 +1136,7 @@ xfs_inactive( * If the inode is already free, then there can be nothing * to clean up here. */ - if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) { + if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) { ASSERT(ip->i_df.if_real_bytes == 0); ASSERT(ip->i_df.if_broot_bytes == 0); return VN_INACTIVE_CACHE; @@ -2448,7 +2448,7 @@ xfs_reclaim( ASSERT(!VN_MAPPED(VFS_I(ip))); /* bad inode, get out here ASAP */ - if (VN_BAD(VFS_I(ip))) { + if (is_bad_inode(VFS_I(ip))) { xfs_ireclaim(ip); return 0; } -- cgit v1.2.3-59-g8ed1b From 6cc87645e2a3c28d857b074e90bf88bfcd117afa Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 16 Mar 2009 08:29:46 +0100 Subject: xfs: factor out code to find the longest free extent in the AG Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig --- fs/xfs/xfs_alloc.c | 26 +++++++++++++++++++++----- fs/xfs/xfs_alloc.h | 6 ++++++ fs/xfs/xfs_bmap.c | 12 ++---------- fs/xfs/xfs_filestream.c | 9 ++------- 4 files changed, 31 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 028e44e58ea9..2cf944eb796d 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1871,6 +1871,25 @@ xfs_alloc_compute_maxlevels( mp->m_ag_maxlevels = level; } +/* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + xfs_extlen_t need, delta = 0; + + need = XFS_MIN_FREELIST_PAG(pag, mp); + if (need > pag->pagf_flcount) + delta = need - pag->pagf_flcount; + + if (pag->pagf_longest > delta) + return pag->pagf_longest - delta; + return pag->pagf_flcount > 0 || pag->pagf_longest > 0; +} + /* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. @@ -1923,15 +1942,12 @@ xfs_alloc_fix_freelist( } if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - need = XFS_MIN_FREELIST_PAG(pag, mp); - delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; /* * If it looks like there isn't a long enough extent, or enough * total blocks, reject it. */ - longest = (pag->pagf_longest > delta) ? - (pag->pagf_longest - delta) : - (pag->pagf_flcount > 0 || pag->pagf_longest > 0); + need = XFS_MIN_FREELIST_PAG(pag, mp); + longest = xfs_alloc_longest_free_extent(mp, pag); if ((args->minlen + args->alignment + args->minalignslop - 1) > longest || ((int)(pag->pagf_freeblks + pag->pagf_flcount - diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 588172796f7b..e704caee10df 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -100,6 +100,12 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ +/* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag); #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 27062d0a879d..07b7d0d59d88 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2712,9 +2712,6 @@ xfs_bmap_btalloc( xfs_agnumber_t startag; xfs_alloc_arg_t args; xfs_extlen_t blen; - xfs_extlen_t delta; - xfs_extlen_t longest; - xfs_extlen_t need; xfs_extlen_t nextminlen = 0; xfs_perag_t *pag; int nullfb; /* true if ap->firstblock isn't set */ @@ -2796,13 +2793,8 @@ xfs_bmap_btalloc( * See xfs_alloc_fix_freelist... */ if (pag->pagf_init) { - need = XFS_MIN_FREELIST_PAG(pag, mp); - delta = need > pag->pagf_flcount ? - need - pag->pagf_flcount : 0; - longest = (pag->pagf_longest > delta) ? - (pag->pagf_longest - delta) : - (pag->pagf_flcount > 0 || - pag->pagf_longest > 0); + xfs_extlen_t longest; + longest = xfs_alloc_longest_free_extent(mp, pag); if (blen < longest) blen = longest; } else diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index f3bb75da384e..6c87c8f304ef 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -140,7 +140,7 @@ _xfs_filestream_pick_ag( xfs_extlen_t minlen) { int err, trylock, nscan; - xfs_extlen_t delta, longest, need, free, minfree, maxfree = 0; + xfs_extlen_t longest, free, minfree, maxfree = 0; xfs_agnumber_t ag, max_ag = NULLAGNUMBER; struct xfs_perag *pag; @@ -186,12 +186,7 @@ _xfs_filestream_pick_ag( goto next_ag; } - need = XFS_MIN_FREELIST_PAG(pag, mp); - delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; - longest = (pag->pagf_longest > delta) ? - (pag->pagf_longest - delta) : - (pag->pagf_flcount > 0 || pag->pagf_longest > 0); - + longest = xfs_alloc_longest_free_extent(mp, pag); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || -- cgit v1.2.3-59-g8ed1b From ac99c58c9e56967037382e31f865b72b10127965 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:35 +0100 Subject: tracing/syscalls: fix missing release of tracing Impact: fix 'stuck' syscall tracer The syscall tracer uses a refcounter to enable several users simultaneously. But the refcounter did not behave correctly and always restored its value to 0 after calling start_syscall_tracing(). Therefore, stop_syscall_tracing() couldn't release correctly the tasks from tracing. Also the tracer forgot to reset the buffer when it is released. Drop the pointless refcount decrement on start_syscall_tracing() and reset the buffer when we release the tracer. This fixes two reported issue: - when we switch from syscall tracer to another tracer, syscall tracing continued. - incorrect use of the refcount. Reported-by: Andrew Morton Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1237151439-6755-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c72e599230ff..c5fc1d8880f6 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -96,8 +96,9 @@ void start_ftrace_syscalls(void) unsigned long flags; struct task_struct *g, *t; + /* Don't enable the flag on the tasks twice */ if (atomic_inc_return(&refcount) != 1) - goto out; + return; arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); @@ -107,8 +108,6 @@ void start_ftrace_syscalls(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); -out: - atomic_dec(&refcount); } void stop_ftrace_syscalls(void) @@ -116,8 +115,9 @@ void stop_ftrace_syscalls(void) unsigned long flags; struct task_struct *g, *t; + /* There are perhaps still some users */ if (atomic_dec_return(&refcount)) - goto out; + return; read_lock_irqsave(&tasklist_lock, flags); @@ -126,8 +126,6 @@ void stop_ftrace_syscalls(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); -out: - atomic_inc(&refcount); } void ftrace_syscall_enter(struct pt_regs *regs) @@ -201,6 +199,7 @@ static int init_syscall_tracer(struct trace_array *tr) static void reset_syscall_tracer(struct trace_array *tr) { stop_ftrace_syscalls(); + tracing_reset_online_cpus(tr); } static struct trace_event syscall_enter_event = { -- cgit v1.2.3-59-g8ed1b From 6404434525bb9f8f2239998f30fd7c93f2efa5b3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:36 +0100 Subject: tracing/syscalls: various cleanups Impact: cleanup - Drop unused cpu variable - Fix some errors on comments Signed-off-by: Frederic Weisbecker LKML-Reference: <1237151439-6755-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c5fc1d8880f6..26f9a8679d3d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -7,7 +7,7 @@ static atomic_t refcount; -/* Our two options */ +/* Option to display the parameters types */ enum { TRACE_SYSCALLS_OPT_TYPES = 0x1, }; @@ -18,7 +18,7 @@ static struct tracer_opt syscalls_opts[] = { }; static struct tracer_flags syscalls_flags = { - .val = 0, /* By default: no args types */ + .val = 0, /* By default: no parameters types */ .opts = syscalls_opts }; @@ -135,12 +135,9 @@ void ftrace_syscall_enter(struct pt_regs *regs) struct ring_buffer_event *event; int size; int syscall_nr; - int cpu; syscall_nr = syscall_get_nr(current, regs); - cpu = raw_smp_processor_id(); - sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; @@ -166,12 +163,9 @@ void ftrace_syscall_exit(struct pt_regs *regs) struct syscall_metadata *sys_data; struct ring_buffer_event *event; int syscall_nr; - int cpu; syscall_nr = syscall_get_nr(current, regs); - cpu = raw_smp_processor_id(); - sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; -- cgit v1.2.3-59-g8ed1b From 5be71b61f17b0e3bc8ad0b1a1b7b53ab7d574ebb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:37 +0100 Subject: tracing/syscalls: protect thread flag toggling from races Impact: fix syscall tracer enable/disable race The current thread flag toggling is racy as shown in the following scenario: - task A is the last user of syscall tracing, it releases the TIF_SYSCALL_FTRACE on each tasks - at the same time task B start syscall tracing. refcount == 0 so it sets up TIF_SYSCALL_FTRACE on each tasks. The effect of the mixup is unpredictable. So this fix adds a mutex on {start,stop}_syscall_tracing(). Reported-by: Andrew Morton Reported-by: Ingo Molnar LKML-Reference: <1237151439-6755-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_syscalls.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 26f9a8679d3d..a2a3af29c943 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -5,7 +5,11 @@ #include "trace_output.h" #include "trace.h" -static atomic_t refcount; +/* Keep a counter of the syscall tracing users */ +static int refcount; + +/* Prevent from races on thread flags toggling */ +static DEFINE_MUTEX(syscall_trace_lock); /* Option to display the parameters types */ enum { @@ -96,9 +100,11 @@ void start_ftrace_syscalls(void) unsigned long flags; struct task_struct *g, *t; + mutex_lock(&syscall_trace_lock); + /* Don't enable the flag on the tasks twice */ - if (atomic_inc_return(&refcount) != 1) - return; + if (++refcount != 1) + goto unlock; arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); @@ -108,6 +114,9 @@ void start_ftrace_syscalls(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); + +unlock: + mutex_unlock(&syscall_trace_lock); } void stop_ftrace_syscalls(void) @@ -115,9 +124,11 @@ void stop_ftrace_syscalls(void) unsigned long flags; struct task_struct *g, *t; + mutex_lock(&syscall_trace_lock); + /* There are perhaps still some users */ - if (atomic_dec_return(&refcount)) - return; + if (--refcount) + goto unlock; read_lock_irqsave(&tasklist_lock, flags); @@ -126,6 +137,9 @@ void stop_ftrace_syscalls(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); + +unlock: + mutex_unlock(&syscall_trace_lock); } void ftrace_syscall_enter(struct pt_regs *regs) -- cgit v1.2.3-59-g8ed1b From 0ea1c4156bf9e2eb370cc5c6fa6eb112bd844dec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:38 +0100 Subject: tracing/syscalls: select kallsysms Syscall tracing must select kallsysms. The arch code builds a table to find the syscall metadata by syscall number. It needs the syscalls names resolution from the symbol table to know which name found on the syscalls metadatas match a function pointer from the arch sys_call_table. Reported-by: Andrew Morton Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1237151439-6755-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 95a0ad191f19..b0a46f889659 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -182,6 +182,7 @@ config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS select TRACING + select KALLSYMS help Basic tracer to catch the syscall entry and exit events. -- cgit v1.2.3-59-g8ed1b From 59f586db98919d7d9c43527b26c8de1cdf9ed912 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 15 Mar 2009 22:10:39 +0100 Subject: tracing/core: fix missing mutex unlock on tracing_set_tracer() Impact: fix possible locking imbalance In case of ring buffer resize failure, tracing_set_tracer forgot to release trace_types_lock. Fix it. Signed-off-by: Frederic Weisbecker LKML-Reference: <1237151439-6755-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index efe3202c0209..c0cf946d42f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2494,7 +2494,7 @@ static int tracing_set_tracer(const char *buf) if (!ring_buffer_expanded) { ret = tracing_resize_ring_buffer(trace_buf_size); if (ret < 0) - return ret; + goto out; ret = 0; } -- cgit v1.2.3-59-g8ed1b From ac1d52d0b85854958c7e78c8006e39aadb6ce4b8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Mar 2009 00:32:41 +0100 Subject: tracing/ftrace: fix double calls to tracing_start() Impact: fix a warning during preemptirqsoff selftests When the preemptirqsoff selftest fails, we see the following warning: [ 6.050000] Testing tracer preemptirqsoff: .. no entries found .. ------------[ cut here ]------------ [ 6.060000] WARNING: at kernel/trace/trace.c:688 tracing_start+0x67/0xd3() [ 6.060000] Modules linked in: [ 6.060000] Pid: 1, comm: swapper Tainted: G [ 6.060000] Call Trace: [ 6.060000] [] warn_slowpath+0xb1/0x100 [ 6.060000] [] ? trace_preempt_on+0x35/0x4b [ 6.060000] [] ? tracing_start+0x31/0xd3 [ 6.060000] [] ? tracing_start+0x31/0xd3 [ 6.060000] [] ? __lock_acquired+0xe6/0x1f2 [ 6.060000] [] ? tracing_start+0x31/0xd3 [ 6.060000] [] tracing_start+0x67/0xd3 [ 6.060000] [] ? irqsoff_tracer_reset+0x2d/0x57 [ 6.060000] [] trace_selftest_startup_preemptirqsoff+0x1c8/0x1f1 [ 6.060000] [] register_tracer+0x12f/0x241 [ 6.060000] [] ? init_irqsoff_tracer+0x0/0x53 [ 6.060000] [] init_irqsoff_tracer+0x3b/0x53 This is because in fail case, the preemptirqsoff tracer selftest calls twice the tracing_start() function: int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) { if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; tracing_start(); <----- goto out; } [...] out: trace->reset(tr); tracing_start(); <------ tracing_max_latency = save_max; return ret; } Since it is well handled in the out path, we don't need the conditional one. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1237159961-7447-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index f907a2b29028..a2ca6f0fef9b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -414,7 +414,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * ret = tracer_init(trace, tr); if (ret) { warn_failed_init_tracer(trace, ret); - goto out; + goto out_no_start; } /* reset the max latency */ @@ -432,21 +432,16 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); - if (ret) { - tracing_start(); + if (ret) goto out; - } ret = trace_test_buffer(&max_tr, &count); - if (ret) { - tracing_start(); + if (ret) goto out; - } if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; - tracing_start(); goto out; } @@ -475,9 +470,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * goto out; } - out: - trace->reset(tr); +out: tracing_start(); +out_no_start: + trace->reset(tr); tracing_max_latency = save_max; return ret; -- cgit v1.2.3-59-g8ed1b From 2fc1dfbe17e7705c55b7a99da995fa565e26f151 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Mar 2009 01:45:03 +0100 Subject: tracing/core: fix early free of cpumasks Impact: fix crashes when tracing cpumasks While ring-buffer allocation, the cpumasks are allocated too, including the tracing cpumask and the per-cpu file mask handler. But these cpumasks are freed accidentally just after. Fix it. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1237164303-11476-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c0cf946d42f5..ae32d3b99b4b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4125,7 +4125,8 @@ __init static int tracer_alloc_buffers(void) &trace_panic_notifier); register_die_notifier(&trace_die_notifier); - ret = 0; + + return 0; out_free_cpumask: free_cpumask_var(tracing_reader_cpumask); -- cgit v1.2.3-59-g8ed1b From 0a6fb8d9c435c612171b453449f98da28e9969a5 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sat, 14 Mar 2009 16:35:27 +0200 Subject: UBIFS: fix lprops committing bug When writing lprop nodes, do not forget to set @from to 0 when switching the LEB. This fixes the following bug: UBIFS error (pid 27768): ubifs_leb_write: writing -15456 bytes at 16:15880, error -22 UBIFS error (pid 27768): do_commit: commit failed, error -22 UBIFS warning (pid 27768): ubifs_ro_mode: switched to read-only mode, error -22 Pid: 27768, comm: freespace Not tainted 2.6.29-rc4-ubifs-2.6 #43 Call Trace: [] ubifs_ro_mode+0x54/0x56 [ubifs] [] do_commit+0x4f5/0x50a [ubifs] [] ubifs_run_commit+0xbc/0xdb [ubifs] [] ubifs_budget_space+0x742/0x9ed [ubifs] [] ? __mutex_lock_common+0x361/0x3ae [] ? ubifs_write_begin+0x18d/0x44c [ubifs] [] ubifs_write_begin+0x321/0x44c [ubifs] [] ? trace_hardirqs_on_caller+0x1f/0x14d [] generic_file_buffered_write+0x12f/0x2d9 [] __generic_file_aio_write_nolock+0x261/0x295 [] generic_file_aio_write+0x69/0xc5 [] ubifs_aio_write+0x14c/0x19e [ubifs] [] do_sync_write+0xe7/0x12d [] ? autoremove_wake_function+0x0/0x38 [] ? security_file_permission+0x11/0x13 [] vfs_write+0xab/0x105 [] sys_write+0x47/0x6f [] system_call_fastpath+0x16/0x1b Signed-off-by: Artem Bityutskiy --- fs/ubifs/lpt_commit.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 27c97a1873d5..1bead5a6d80a 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -421,8 +421,7 @@ static int write_cnodes(struct ubifs_info *c) err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; - from = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -480,7 +479,7 @@ static int write_cnodes(struct ubifs_info *c) err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -507,7 +506,7 @@ static int write_cnodes(struct ubifs_info *c) err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); -- cgit v1.2.3-59-g8ed1b From c9927c3ee2d3d14893efd793a2a9ea772ddb4289 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 16 Mar 2009 09:42:03 +0200 Subject: UBIFS: use KERN_CONT Signed-off-by: Artem Bityutskiy --- fs/ubifs/debug.c | 4 ++-- fs/ubifs/lpt_commit.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index e975bd82f38b..93f6532eff00 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) "bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) - printk("%c", dent->name[i]); + printk(KERN_CONT "%c", dent->name[i]); } - printk("\n"); + printk(KERN_CONT "\n"); break; } diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 1bead5a6d80a..9d77f68b2f8e 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1,4 +1,4 @@ -/* + /* * This file is part of UBIFS. * * Copyright (C) 2006-2008 Nokia Corporation. @@ -1921,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) lnum, offs); err = ubifs_unpack_nnode(c, buf, &nnode); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { - printk("%d:%d", nnode.nbranch[i].lnum, + printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, nnode.nbranch[i].offs); if (i != UBIFS_LPT_FANOUT - 1) - printk(", "); + printk(KERN_CONT ", "); } - printk("\n"); + printk(KERN_CONT "\n"); break; } case UBIFS_LPT_LTAB: -- cgit v1.2.3-59-g8ed1b From fb1cd01a33ecb8a49d590c034ba146dff80c5597 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 16 Mar 2009 09:56:57 +0200 Subject: UBIFS: introduce a helpful variable This patch introduces a helpful @c->idx_leb_size variable. The patch also fixes some spelling issues and makes comments use "LEB" instead of "eraseblock", which is more correct. Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 33 +++++++++++++++------------------ fs/ubifs/sb.c | 1 - fs/ubifs/super.c | 3 +++ fs/ubifs/ubifs.h | 7 +++++-- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index f393620890ee..8cd425b628ee 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c) } /** - * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. + * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index. * @c: UBIFS file-system description object * - * This function calculates and returns the number of eraseblocks which should - * be kept for index usage. + * This function calculates and returns the number of LEBs which should be kept + * for index usage. */ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) { - int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; + int idx_lebs; long long idx_size; idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; - /* And make sure we have thrice the index size of space reserved */ - idx_size = idx_size + (idx_size << 1); - + idx_size += idx_size << 1; /* * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' * pair, nor similarly the two variables for the new index size, so we * have to do this costly 64-bit division on fast-path. */ - idx_size += eff_leb_size - 1; - idx_lebs = div_u64(idx_size, eff_leb_size); + idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size); /* * The index head is not available for the in-the-gaps method, so add an * extra LEB to compensate. @@ -310,15 +307,15 @@ static int can_use_rp(struct ubifs_info *c) * do_budget_space - reserve flash space for index and data growth. * @c: UBIFS file-system description object * - * This function makes sure UBIFS has enough free eraseblocks for index growth - * and data. + * This function makes sure UBIFS has enough free LEBs for index growth and + * data. * * When budgeting index space, UBIFS reserves thrice as many LEBs as the index * would take if it was consolidated and written to the flash. This guarantees * that the "in-the-gaps" commit method always succeeds and UBIFS will always * be able to commit dirty index. So this function basically adds amount of * budgeted index space to the size of the current index, multiplies this by 3, - * and makes sure this does not exceed the amount of free eraseblocks. + * and makes sure this does not exceed the amount of free LEBs. * * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might @@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free) * This function calculates amount of free space to report to user-space. * * Because UBIFS may introduce substantial overhead (the index, node headers, - * alignment, wastage at the end of eraseblocks, etc), it cannot report real - * amount of free flash space it has (well, because not all dirty space is - * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, - * it would bread user expectations about what free space is. Users seem to - * accustomed to assume that if the file-system reports N bytes of free space, - * they would be able to fit a file of N bytes to the FS. This almost works for + * alignment, wastage at the end of LEBs, etc), it cannot report real amount of + * free flash space it has (well, because not all dirty space is reclaimable, + * UBIFS does not actually know the real amount). If UBIFS did so, it would + * bread user expectations about what free space is. Users seem to accustomed + * to assume that if the file-system reports N bytes of free space, they would + * be able to fit a file of N bytes to the FS. This almost works for * traditional file-systems, because they have way less overhead than UBIFS. * So, to keep users happy, UBIFS tries to take the overhead into account. */ diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index e070c643d1bb..0dec47c87c6d 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -623,7 +623,6 @@ int ubifs_read_superblock(struct ubifs_info *c) c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; c->main_first = c->leb_cnt - c->main_lebs; - c->report_rp_size = ubifs_reported_space(c, c->rp_size); err = validate_sb(c, sup); out: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 03cd9ac4dcb2..7bdd248ec770 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c) if (err) return err; + /* Initialize effective LEB size used in budgeting calculations */ + c->idx_leb_size = c->leb_size - c->max_idx_node_sz; return 0; } @@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c) long long tmp64; c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->report_rp_size = ubifs_reported_space(c, c->rp_size); /* * Calculate total amount of FS blocks. This number is not used diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 2da1193a381f..a53b9a6df2be 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1015,6 +1015,8 @@ struct ubifs_debug_info; * @min_io_shift: number of bits in @min_io_size minus one * @leb_size: logical eraseblock size in bytes * @half_leb_size: half LEB size + * @idx_leb_size: how many bytes of an LEB are effectively available when it is + * used to store indexing nodes (@leb_size - @max_idx_node_sz) * @leb_cnt: count of logical eraseblocks * @max_leb_cnt: maximum count of logical eraseblocks * @old_leb_cnt: count of logical eraseblocks before re-size @@ -1132,8 +1134,8 @@ struct ubifs_debug_info; * previous commit start * @uncat_list: list of un-categorized LEBs * @empty_list: list of empty LEBs - * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) - * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) + * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) * @freeable_cnt: number of freeable LEBs in @freeable_list * * @ltab_lnum: LEB number of LPT's own lprops table @@ -1253,6 +1255,7 @@ struct ubifs_info { int min_io_shift; int leb_size; int half_leb_size; + int idx_leb_size; int leb_cnt; int max_leb_cnt; int old_leb_cnt; -- cgit v1.2.3-59-g8ed1b From 03303549b1695dc024d4a653cc16bd79f78f9750 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Mar 2009 22:41:00 +0100 Subject: tracing/ftrace: fix the check on nopped sites Impact: fix a dynamic tracing failure Recently, the function and function graph tracers failed to use dynamic tracing after the following commit: fa9d13cf135efbd454453a53b6299976bea245a9 (ftrace: don't try to __ftrace_replace_code on !FTRACE_FL_CONVERTED rec) The patch is right except a mistake on the check for the FTRACE_FL_CONVERTED flag. The code patching is aborted in case of successfully nopped sites. What we want is the opposite: ignore the callsites that haven't been nopped. Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 90d5729afeff..7847806eefef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -537,7 +537,7 @@ static void ftrace_replace_code(int enable) */ if (rec->flags & FTRACE_FL_FREE || rec->flags & FTRACE_FL_FAILED || - rec->flags & FTRACE_FL_CONVERTED) + !(rec->flags & FTRACE_FL_CONVERTED)) continue; /* ignore updates to this record's mcount site */ -- cgit v1.2.3-59-g8ed1b From 4ca530852346be239b7c19e7bec5d2b78855bebe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Mar 2009 19:20:15 -0400 Subject: tracing: protect reader of cmdline output Impact: fix to one cause of incorrect comm outputs in trace The spinlock only protected the creation of a comm <=> pid pair. But it was possible that a reader could look up a pid, and get the wrong comm because it had no locking. This also required changing trace_find_cmdline to copy the comm cache and not just send back a pointer to it. Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 23 ++++++++++++++++++----- kernel/trace/trace.c | 20 ++++++++++++-------- kernel/trace/trace.h | 2 +- kernel/trace/trace_functions_graph.c | 12 ++++++------ kernel/trace/trace_output.c | 18 ++++++++++++------ 5 files changed, 49 insertions(+), 26 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1f32e4edf490..b171778e3863 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1027,7 +1027,9 @@ static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t, static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { - const char *cmd = trace_find_cmdline(ent->pid); + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); if (t_sec(ent)) return trace_seq_printf(s, "%llu + %u [%s]\n", @@ -1057,19 +1059,30 @@ static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) { - return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid)); + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + return trace_seq_printf(s, "[%s]\n", cmd); } static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) { - return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid), - get_pdu_int(ent)); + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); } static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) { + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), trace_find_cmdline(ent->pid)); + get_pdu_int(ent), cmd); } /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index efe3202c0209..2796bd2b17e4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -770,25 +770,29 @@ static void trace_save_cmdline(struct task_struct *tsk) __raw_spin_unlock(&trace_cmdline_lock); } -char *trace_find_cmdline(int pid) +void trace_find_cmdline(int pid, char comm[]) { - char *cmdline = "<...>"; unsigned map; - if (!pid) - return ""; + if (!pid) { + strcpy(comm, ""); + return; + } - if (pid > PID_MAX_DEFAULT) - goto out; + if (pid > PID_MAX_DEFAULT) { + strcpy(comm, "<...>"); + return; + } + __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map >= SAVED_CMDLINES) goto out; - cmdline = saved_cmdlines[map]; + strcpy(comm, saved_cmdlines[map]); out: - return cmdline; + __raw_spin_unlock(&trace_cmdline_lock); } void tracing_record_cmdline(struct task_struct *tsk) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 56ce34d90b03..b0ecad8ecc34 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -547,7 +547,7 @@ struct tracer_switch_ops { }; #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ -extern char *trace_find_cmdline(int pid); +extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4c388607ed67..6004ccac2dd7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -190,15 +190,15 @@ print_graph_cpu(struct trace_seq *s, int cpu) static enum print_line_t print_graph_proc(struct trace_seq *s, pid_t pid) { - int i; - int ret; - int len; - char comm[8]; - int spaces = 0; + char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ char pid_str[11]; + int spaces = 0; + int ret; + int len; + int i; - strncpy(comm, trace_find_cmdline(pid), 7); + trace_find_cmdline(pid, comm); comm[7] = '\0'; sprintf(pid_str, "%d", pid); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ea9d3b410c7a..6a4c9dea191e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -309,9 +309,9 @@ static int lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) { int hardirq, softirq; - char *comm; + char comm[TASK_COMM_LEN]; - comm = trace_find_cmdline(entry->pid); + trace_find_cmdline(entry->pid, comm); hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -346,10 +346,12 @@ int trace_print_context(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; - char *comm = trace_find_cmdline(entry->pid); unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned long secs = (unsigned long)t; + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", comm, entry->pid, iter->cpu, secs, usec_rem); @@ -372,7 +374,10 @@ int trace_print_lat_context(struct trace_iterator *iter) rel_usecs = ns2usecs(next_ts - iter->ts); if (verbose) { - char *comm = trace_find_cmdline(entry->pid); + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, entry->pid, iter->cpu, entry->flags, @@ -577,14 +582,15 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, char *delim) { struct ctx_switch_entry *field; - char *comm; + char comm[TASK_COMM_LEN]; int S, T; + trace_assign_type(field, iter->ent); T = task_state_char(field->next_state); S = task_state_char(field->prev_state); - comm = trace_find_cmdline(field->next_pid); + trace_find_cmdline(field->next_pid, comm); if (!trace_seq_printf(&iter->seq, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", field->prev_pid, -- cgit v1.2.3-59-g8ed1b From 6adaad14d7d4d3ef31b4e2dc992b18b5da7c4eb3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Mar 2009 21:57:17 -0400 Subject: tracing: stop comm recording on tracing off Impact: fix for losing comms in trace The command lines of tasks are cached at sched switch to not need to record them at every trace point. Disabling the tracing on stops the recording of traces, but does not stop the caching of command lines. When the tracing is off the cache may overflow and cause the tracing to show incorrect tasks matching the PIDs. This patch disables prevents updates to the comm cache when the ring buffer is off. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2796bd2b17e4..8f89690230e6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -797,7 +797,7 @@ void trace_find_cmdline(int pid, char comm[]) void tracing_record_cmdline(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled)) + if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) return; trace_save_cmdline(tsk); -- cgit v1.2.3-59-g8ed1b From ff69f2bba67bd45514923aaedbf40fe351787c59 Mon Sep 17 00:00:00 2001 From: "alex.shi" Date: Wed, 4 Mar 2009 11:55:26 -0800 Subject: acpi: fix of pmtimer overflow that make Cx states time incorrect We found Cx states time abnormal in our some of machines which have 16 LCPUs, the C0 take too many time while system is really idle when kernel enabled tickless and highres. powertop output is below: PowerTOP version 1.9 (C) 2007 Intel Corporation Cn Avg residency P-states (frequencies) C0 (cpu running) (40.5%) 2.53 Ghz 0.0% C1 0.0ms ( 0.0%) 2.53 Ghz 0.0% C2 128.8ms (59.5%) 2.40 Ghz 0.0% 1.60 Ghz 100.0% Wakeups-from-idle per second : 4.7 interval: 20.0s no ACPI power usage estimate available Top causes for wakeups: 41.4% ( 24.9) : extra timer interrupt 20.2% ( 12.2) : usb_hcd_poll_rh_status (rh_timer_func) After tacking detailed for this issue, Yakui and I find it is due to 24 bit PM timer overflows when some of cpu sleep more than 4 seconds. With tickless kernel, the CPU want to sleep as much as possible when system idle. But the Cx sleep time are recorded by pmtimer which length is determined by BIOS. The current Cx time was gotten in the following function from driver/acpi/processor_idle.c: static inline u32 ticks_elapsed(u32 t1, u32 t2) { if (t2 >= t1) return (t2 - t1); else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER)) return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); else return ((0xFFFFFFFF - t1) + t2); } If pmtimer is 24 bits and it take 5 seconds from t1 to t2, in above function, just about 1 seconds ticks was recorded. So the Cx time will be reduced about 4 seconds. and this is why we see above powertop output. To resolve this problem, Yakui and I use ktime_get() to record the Cx states time instead of PM timer as the following patch. the patch was tested with i386/x86_64 modes on several platforms. Acked-by: Venkatesh Pallipadi Tested-by: Alex Shi Signed-off-by: Alex Shi Signed-off-by: Yakui.zhao Signed-off-by: Andrew Morton Signed-off-by: Len Brown --- drivers/acpi/processor_idle.c | 63 +++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 36 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 7bc22a471fe3..879af875c213 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -64,7 +64,6 @@ #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_idle"); #define ACPI_PROCESSOR_FILE_POWER "power" -#define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000) #define PM_TIMER_TICK_NS (1000000000ULL/PM_TIMER_FREQUENCY) #define C2_OVERHEAD 1 /* 1us */ #define C3_OVERHEAD 1 /* 1us */ @@ -78,6 +77,10 @@ module_param(nocst, uint, 0000); static unsigned int latency_factor __read_mostly = 2; module_param(latency_factor, uint, 0644); +static s64 us_to_pm_timer_ticks(s64 t) +{ + return div64_u64(t * PM_TIMER_FREQUENCY, 1000000); +} /* * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3. * For now disable this. Probably a bug somewhere else. @@ -159,25 +162,6 @@ static struct dmi_system_id __cpuinitdata processor_power_dmi_table[] = { {}, }; -static inline u32 ticks_elapsed(u32 t1, u32 t2) -{ - if (t2 >= t1) - return (t2 - t1); - else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER)) - return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); - else - return ((0xFFFFFFFF - t1) + t2); -} - -static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2) -{ - if (t2 >= t1) - return PM_TIMER_TICKS_TO_US(t2 - t1); - else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER)) - return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); - else - return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2); -} /* * Callers should disable interrupts before the call and enable @@ -853,7 +837,8 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) static int acpi_idle_enter_c1(struct cpuidle_device *dev, struct cpuidle_state *state) { - u32 t1, t2; + ktime_t kt1, kt2; + s64 idle_time; struct acpi_processor *pr; struct acpi_processor_cx *cx = cpuidle_get_statedata(state); @@ -871,14 +856,15 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev, return 0; } - t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); + kt1 = ktime_get_real(); acpi_idle_do_entry(cx); - t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); + kt2 = ktime_get_real(); + idle_time = ktime_to_us(ktime_sub(kt2, kt1)); local_irq_enable(); cx->usage++; - return ticks_elapsed_in_us(t1, t2); + return idle_time; } /** @@ -891,8 +877,9 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, { struct acpi_processor *pr; struct acpi_processor_cx *cx = cpuidle_get_statedata(state); - u32 t1, t2; - int sleep_ticks = 0; + ktime_t kt1, kt2; + s64 idle_time; + s64 sleep_ticks = 0; pr = __get_cpu_var(processors); @@ -925,18 +912,19 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, if (cx->type == ACPI_STATE_C3) ACPI_FLUSH_CPU_CACHE(); - t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); + kt1 = ktime_get_real(); /* Tell the scheduler that we are going deep-idle: */ sched_clock_idle_sleep_event(); acpi_idle_do_entry(cx); - t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); + kt2 = ktime_get_real(); + idle_time = ktime_to_us(ktime_sub(kt2, kt1)); #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86) /* TSC could halt in idle, so notify users */ if (tsc_halts_in_c(cx->type)) mark_tsc_unstable("TSC halts in idle");; #endif - sleep_ticks = ticks_elapsed(t1, t2); + sleep_ticks = us_to_pm_timer_ticks(idle_time); /* Tell the scheduler how much we idled: */ sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); @@ -948,7 +936,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev, acpi_state_timer_broadcast(pr, cx, 0); cx->time += sleep_ticks; - return ticks_elapsed_in_us(t1, t2); + return idle_time; } static int c3_cpu_count; @@ -966,8 +954,10 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, { struct acpi_processor *pr; struct acpi_processor_cx *cx = cpuidle_get_statedata(state); - u32 t1, t2; - int sleep_ticks = 0; + ktime_t kt1, kt2; + s64 idle_time; + s64 sleep_ticks = 0; + pr = __get_cpu_var(processors); @@ -1034,9 +1024,10 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, ACPI_FLUSH_CPU_CACHE(); } - t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); + kt1 = ktime_get_real(); acpi_idle_do_entry(cx); - t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); + kt2 = ktime_get_real(); + idle_time = ktime_to_us(ktime_sub(kt2, kt1)); /* Re-enable bus master arbitration */ if (pr->flags.bm_check && pr->flags.bm_control) { @@ -1051,7 +1042,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, if (tsc_halts_in_c(ACPI_STATE_C3)) mark_tsc_unstable("TSC halts in idle"); #endif - sleep_ticks = ticks_elapsed(t1, t2); + sleep_ticks = us_to_pm_timer_ticks(idle_time); /* Tell the scheduler how much we idled: */ sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); @@ -1062,7 +1053,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, acpi_state_timer_broadcast(pr, cx, 0); cx->time += sleep_ticks; - return ticks_elapsed_in_us(t1, t2); + return idle_time; } struct cpuidle_driver acpi_idle_driver = { -- cgit v1.2.3-59-g8ed1b From 1c9ca3a7d41b5db884033900b539b9aeb61a399e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 17 Feb 2009 14:00:40 -0700 Subject: ACPI: pci_link: clean up whitespace This patch makes whitespace and indentation more consistent. Signed-off-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/acpi/pci_link.c | 73 ++++++++++++++++++------------------------------- 1 file changed, 27 insertions(+), 46 deletions(-) diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 6c772ca76bd1..19bc3bcda602 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -43,13 +43,14 @@ #include #include -#define _COMPONENT ACPI_PCI_COMPONENT +#define _COMPONENT ACPI_PCI_COMPONENT ACPI_MODULE_NAME("pci_link"); #define ACPI_PCI_LINK_CLASS "pci_irq_routing" #define ACPI_PCI_LINK_DEVICE_NAME "PCI Interrupt Link" #define ACPI_PCI_LINK_FILE_INFO "info" #define ACPI_PCI_LINK_FILE_STATUS "state" -#define ACPI_PCI_LINK_MAX_POSSIBLE 16 +#define ACPI_PCI_LINK_MAX_POSSIBLE 16 + static int acpi_pci_link_add(struct acpi_device *device); static int acpi_pci_link_remove(struct acpi_device *device, int type); @@ -66,7 +67,7 @@ static struct acpi_driver acpi_pci_link_driver = { .ops = { .add = acpi_pci_link_add, .remove = acpi_pci_link_remove, - }, + }, }; /* @@ -76,7 +77,7 @@ static struct acpi_driver acpi_pci_link_driver = { struct acpi_pci_link_irq { u8 active; /* Current IRQ */ u8 triggering; /* All IRQs */ - u8 polarity; /* All IRQs */ + u8 polarity; /* All IRQs */ u8 resource_type; u8 possible_count; u8 possible[ACPI_PCI_LINK_MAX_POSSIBLE]; @@ -85,15 +86,15 @@ struct acpi_pci_link_irq { }; struct acpi_pci_link { - struct list_head node; - struct acpi_device *device; - struct acpi_pci_link_irq irq; - int refcnt; + struct list_head node; + struct acpi_device *device; + struct acpi_pci_link_irq irq; + int refcnt; }; static struct { - int count; - struct list_head entries; + int count; + struct list_head entries; } acpi_link; static DEFINE_MUTEX(acpi_link_lock); @@ -104,13 +105,12 @@ static DEFINE_MUTEX(acpi_link_lock); /* * set context (link) possible list from resource list */ -static acpi_status -acpi_pci_link_check_possible(struct acpi_resource *resource, void *context) +static acpi_status acpi_pci_link_check_possible(struct acpi_resource *resource, + void *context) { struct acpi_pci_link *link = context; u32 i = 0; - switch (resource->type) { case ACPI_RESOURCE_TYPE_START_DEPENDENT: case ACPI_RESOURCE_TYPE_END_TAG: @@ -179,7 +179,6 @@ static int acpi_pci_link_get_possible(struct acpi_pci_link *link) { acpi_status status; - if (!link) return -EINVAL; @@ -197,12 +196,11 @@ static int acpi_pci_link_get_possible(struct acpi_pci_link *link) return 0; } -static acpi_status -acpi_pci_link_check_current(struct acpi_resource *resource, void *context) +static acpi_status acpi_pci_link_check_current(struct acpi_resource *resource, + void *context) { int *irq = (int *)context; - switch (resource->type) { case ACPI_RESOURCE_TYPE_START_DEPENDENT: case ACPI_RESOURCE_TYPE_END_TAG: @@ -316,7 +314,6 @@ static int acpi_pci_link_set(struct acpi_pci_link *link, int irq) } *resource; struct acpi_buffer buffer = { 0, NULL }; - if (!link || !irq) return -EINVAL; @@ -479,10 +476,10 @@ static int acpi_irq_penalty[ACPI_MAX_IRQS] = { PIRQ_PENALTY_PCI_AVAILABLE, /* IRQ9 PCI, often acpi */ PIRQ_PENALTY_PCI_AVAILABLE, /* IRQ10 PCI */ PIRQ_PENALTY_PCI_AVAILABLE, /* IRQ11 PCI */ - PIRQ_PENALTY_ISA_USED, /* IRQ12 mouse */ - PIRQ_PENALTY_ISA_USED, /* IRQ13 fpe, sometimes */ - PIRQ_PENALTY_ISA_USED, /* IRQ14 ide0 */ - PIRQ_PENALTY_ISA_USED, /* IRQ15 ide1 */ + PIRQ_PENALTY_ISA_USED, /* IRQ12 mouse */ + PIRQ_PENALTY_ISA_USED, /* IRQ13 fpe, sometimes */ + PIRQ_PENALTY_ISA_USED, /* IRQ14 ide0 */ + PIRQ_PENALTY_ISA_USED, /* IRQ15 ide1 */ /* >IRQ15 */ }; @@ -492,12 +489,10 @@ int __init acpi_irq_penalty_init(void) struct acpi_pci_link *link = NULL; int i = 0; - /* * Update penalties to facilitate IRQ balancing. */ list_for_each(node, &acpi_link.entries) { - link = list_entry(node, struct acpi_pci_link, node); if (!link) { printk(KERN_ERR PREFIX "Invalid link context\n"); @@ -527,7 +522,6 @@ int __init acpi_irq_penalty_init(void) } /* Add a penalty for the SCI */ acpi_irq_penalty[acpi_gbl_FADT.sci_interrupt] += PIRQ_PENALTY_PCI_USING; - return 0; } @@ -538,7 +532,6 @@ static int acpi_pci_link_allocate(struct acpi_pci_link *link) int irq; int i; - if (link->irq.initialized) { if (link->refcnt == 0) /* This means the link is disabled but initialized */ @@ -566,11 +559,10 @@ static int acpi_pci_link_allocate(struct acpi_pci_link *link) /* * if active found, use it; else pick entry from end of possible list. */ - if (link->irq.active) { + if (link->irq.active) irq = link->irq.active; - } else { + else irq = link->irq.possible[link->irq.possible_count - 1]; - } if (acpi_irq_balance || !link->irq.active) { /* @@ -599,7 +591,6 @@ static int acpi_pci_link_allocate(struct acpi_pci_link *link) } link->irq.initialized = 1; - return 0; } @@ -608,17 +599,13 @@ static int acpi_pci_link_allocate(struct acpi_pci_link *link) * success: return IRQ >= 0 * failure: return -1 */ - -int -acpi_pci_link_allocate_irq(acpi_handle handle, - int index, - int *triggering, int *polarity, char **name) +int acpi_pci_link_allocate_irq(acpi_handle handle, int index, int *triggering, + int *polarity, char **name) { int result = 0; struct acpi_device *device = NULL; struct acpi_pci_link *link = NULL; - result = acpi_bus_get_device(handle, &device); if (result) { printk(KERN_ERR PREFIX "Invalid link device\n"); @@ -673,7 +660,6 @@ int acpi_pci_link_free_irq(acpi_handle handle) struct acpi_pci_link *link = NULL; acpi_status result; - result = acpi_bus_get_device(handle, &device); if (result) { printk(KERN_ERR PREFIX "Invalid link device\n"); @@ -708,9 +694,9 @@ int acpi_pci_link_free_irq(acpi_handle handle) "Link %s is dereferenced\n", acpi_device_bid(link->device))); - if (link->refcnt == 0) { + if (link->refcnt == 0) acpi_evaluate_object(link->device->handle, "_DIS", NULL, NULL); - } + mutex_unlock(&acpi_link_lock); return (link->irq.active); } @@ -726,7 +712,6 @@ static int acpi_pci_link_add(struct acpi_device *device) int i = 0; int found = 0; - if (!device) return -EINVAL; @@ -784,11 +769,10 @@ static int acpi_pci_link_add(struct acpi_device *device) static int acpi_pci_link_resume(struct acpi_pci_link *link) { - if (link->refcnt && link->irq.active && link->irq.initialized) return (acpi_pci_link_set(link, link->irq.active)); - else - return 0; + + return 0; } static int irqrouter_resume(struct sys_device *dev) @@ -811,7 +795,6 @@ static int acpi_pci_link_remove(struct acpi_device *device, int type) { struct acpi_pci_link *link = NULL; - if (!device || !acpi_driver_data(device)) return -EINVAL; @@ -822,7 +805,6 @@ static int acpi_pci_link_remove(struct acpi_device *device, int type) mutex_unlock(&acpi_link_lock); kfree(link); - return 0; } @@ -931,7 +913,6 @@ static int __init irqrouter_init_sysfs(void) { int error; - if (acpi_disabled || acpi_noirq) return 0; -- cgit v1.2.3-59-g8ed1b From c9d6244329c8149312dba27e78dc4a83b35a6ae5 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 17 Feb 2009 14:00:45 -0700 Subject: ACPI: pci_link: remove unnecessary casts and initializations Remove unnecessary casts and initializations. Signed-off-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/acpi/pci_link.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 19bc3bcda602..b59f59efb72f 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -109,7 +109,7 @@ static acpi_status acpi_pci_link_check_possible(struct acpi_resource *resource, void *context) { struct acpi_pci_link *link = context; - u32 i = 0; + u32 i; switch (resource->type) { case ACPI_RESOURCE_TYPE_START_DEPENDENT: @@ -199,7 +199,7 @@ static int acpi_pci_link_get_possible(struct acpi_pci_link *link) static acpi_status acpi_pci_link_check_current(struct acpi_resource *resource, void *context) { - int *irq = (int *)context; + int *irq = context; switch (resource->type) { case ACPI_RESOURCE_TYPE_START_DEPENDENT: @@ -256,7 +256,7 @@ static acpi_status acpi_pci_link_check_current(struct acpi_resource *resource, static int acpi_pci_link_get_current(struct acpi_pci_link *link) { int result = 0; - acpi_status status = AE_OK; + acpi_status status; int irq = 0; if (!link) @@ -306,8 +306,8 @@ static int acpi_pci_link_get_current(struct acpi_pci_link *link) static int acpi_pci_link_set(struct acpi_pci_link *link, int irq) { - int result = 0; - acpi_status status = AE_OK; + int result; + acpi_status status; struct { struct acpi_resource res; struct acpi_resource end; @@ -485,9 +485,9 @@ static int acpi_irq_penalty[ACPI_MAX_IRQS] = { int __init acpi_irq_penalty_init(void) { - struct list_head *node = NULL; - struct acpi_pci_link *link = NULL; - int i = 0; + struct list_head *node; + struct acpi_pci_link *link; + int i; /* * Update penalties to facilitate IRQ balancing. @@ -602,9 +602,9 @@ static int acpi_pci_link_allocate(struct acpi_pci_link *link) int acpi_pci_link_allocate_irq(acpi_handle handle, int index, int *triggering, int *polarity, char **name) { - int result = 0; - struct acpi_device *device = NULL; - struct acpi_pci_link *link = NULL; + int result; + struct acpi_device *device; + struct acpi_pci_link *link; result = acpi_bus_get_device(handle, &device); if (result) { @@ -656,8 +656,8 @@ int acpi_pci_link_allocate_irq(acpi_handle handle, int index, int *triggering, */ int acpi_pci_link_free_irq(acpi_handle handle) { - struct acpi_device *device = NULL; - struct acpi_pci_link *link = NULL; + struct acpi_device *device; + struct acpi_pci_link *link; acpi_status result; result = acpi_bus_get_device(handle, &device); @@ -707,9 +707,9 @@ int acpi_pci_link_free_irq(acpi_handle handle) static int acpi_pci_link_add(struct acpi_device *device) { - int result = 0; - struct acpi_pci_link *link = NULL; - int i = 0; + int result; + struct acpi_pci_link *link; + int i; int found = 0; if (!device) @@ -777,8 +777,8 @@ static int acpi_pci_link_resume(struct acpi_pci_link *link) static int irqrouter_resume(struct sys_device *dev) { - struct list_head *node = NULL; - struct acpi_pci_link *link = NULL; + struct list_head *node; + struct acpi_pci_link *link; list_for_each(node, &acpi_link.entries) { link = list_entry(node, struct acpi_pci_link, node); @@ -793,7 +793,7 @@ static int irqrouter_resume(struct sys_device *dev) static int acpi_pci_link_remove(struct acpi_device *device, int type) { - struct acpi_pci_link *link = NULL; + struct acpi_pci_link *link; if (!device || !acpi_driver_data(device)) return -EINVAL; -- cgit v1.2.3-59-g8ed1b From 6eca4b4ca168981d7648be371945c2a21f463a45 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 17 Feb 2009 14:00:50 -0700 Subject: ACPI: pci_link: remove unnecessary null pointer checks Better to oops and learn about a bug than to silently cover it up. Signed-off-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/acpi/pci_link.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index b59f59efb72f..dd9ebb9fda42 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -179,9 +179,6 @@ static int acpi_pci_link_get_possible(struct acpi_pci_link *link) { acpi_status status; - if (!link) - return -EINVAL; - status = acpi_walk_resources(link->device->handle, METHOD_NAME__PRS, acpi_pci_link_check_possible, link); if (ACPI_FAILURE(status)) { @@ -259,9 +256,6 @@ static int acpi_pci_link_get_current(struct acpi_pci_link *link) acpi_status status; int irq = 0; - if (!link) - return -EINVAL; - link->irq.active = 0; /* in practice, status disabled is meaningless, ignore it */ @@ -314,7 +308,7 @@ static int acpi_pci_link_set(struct acpi_pci_link *link, int irq) } *resource; struct acpi_buffer buffer = { 0, NULL }; - if (!link || !irq) + if (!irq) return -EINVAL; resource = kzalloc(sizeof(*resource) + 1, irqs_disabled() ? GFP_ATOMIC: GFP_KERNEL); @@ -712,9 +706,6 @@ static int acpi_pci_link_add(struct acpi_device *device) int i; int found = 0; - if (!device) - return -EINVAL; - link = kzalloc(sizeof(struct acpi_pci_link), GFP_KERNEL); if (!link) return -ENOMEM; @@ -795,9 +786,6 @@ static int acpi_pci_link_remove(struct acpi_device *device, int type) { struct acpi_pci_link *link; - if (!device || !acpi_driver_data(device)) - return -EINVAL; - link = acpi_driver_data(device); mutex_lock(&acpi_link_lock); -- cgit v1.2.3-59-g8ed1b From 5f0dccaa81e239477413d0def1133850530f1bbe Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 17 Feb 2009 14:00:55 -0700 Subject: ACPI: pci_link: simplify list of link devices We don't need a struct containing a count and a list_head; a simple list_head is sufficient. The list iterators handle empty lists fine. Furthermore, we don't need to check for null list entries because we only add non-null entries. Signed-off-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/acpi/pci_link.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index dd9ebb9fda42..16e0f9d3d17c 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -86,16 +86,13 @@ struct acpi_pci_link_irq { }; struct acpi_pci_link { - struct list_head node; + struct list_head list; struct acpi_device *device; struct acpi_pci_link_irq irq; int refcnt; }; -static struct { - int count; - struct list_head entries; -} acpi_link; +static LIST_HEAD(acpi_link_list); static DEFINE_MUTEX(acpi_link_lock); /* -------------------------------------------------------------------------- @@ -479,19 +476,13 @@ static int acpi_irq_penalty[ACPI_MAX_IRQS] = { int __init acpi_irq_penalty_init(void) { - struct list_head *node; struct acpi_pci_link *link; int i; /* * Update penalties to facilitate IRQ balancing. */ - list_for_each(node, &acpi_link.entries) { - link = list_entry(node, struct acpi_pci_link, node); - if (!link) { - printk(KERN_ERR PREFIX "Invalid link context\n"); - continue; - } + list_for_each_entry(link, &acpi_link_list, list) { /* * reflect the possible and active irqs in the penalty table -- @@ -743,9 +734,7 @@ static int acpi_pci_link_add(struct acpi_device *device) printk("\n"); - /* TBD: Acquire/release lock */ - list_add_tail(&link->node, &acpi_link.entries); - acpi_link.count++; + list_add_tail(&link->list, &acpi_link_list); end: /* disable all links -- to be activated on use */ @@ -768,15 +757,9 @@ static int acpi_pci_link_resume(struct acpi_pci_link *link) static int irqrouter_resume(struct sys_device *dev) { - struct list_head *node; struct acpi_pci_link *link; - list_for_each(node, &acpi_link.entries) { - link = list_entry(node, struct acpi_pci_link, node); - if (!link) { - printk(KERN_ERR PREFIX "Invalid link context\n"); - continue; - } + list_for_each_entry(link, &acpi_link_list, list) { acpi_pci_link_resume(link); } return 0; @@ -789,7 +772,7 @@ static int acpi_pci_link_remove(struct acpi_device *device, int type) link = acpi_driver_data(device); mutex_lock(&acpi_link_lock); - list_del(&link->node); + list_del(&link->list); mutex_unlock(&acpi_link_lock); kfree(link); @@ -926,9 +909,6 @@ static int __init acpi_pci_link_init(void) acpi_irq_balance = 0; } - acpi_link.count = 0; - INIT_LIST_HEAD(&acpi_link.entries); - if (acpi_bus_register_driver(&acpi_pci_link_driver) < 0) return -ENODEV; -- cgit v1.2.3-59-g8ed1b From c686d141c7c668ac186015841a1ccd285a1f3362 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 17 Feb 2009 13:49:10 -0700 Subject: ACPI: PCI: use generic pci_swizzle_interrupt_pin() Use the generic pci_swizzle_interrupt_pin() instead of ACPI-specific code. Signed-off-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/acpi/pci_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 891bdf6679f3..be6b9093f8df 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -319,7 +319,7 @@ static struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin) */ bridge = dev->bus->self; while (bridge) { - pin = (((pin - 1) + PCI_SLOT(dev->devfn)) % 4) + 1; + pin = pci_swizzle_interrupt_pin(dev, pin); if ((bridge->class >> 8) == PCI_CLASS_BRIDGE_CARDBUS) { /* PC card has the same IRQ as its cardbridge */ -- cgit v1.2.3-59-g8ed1b From 1c48aa36ef301d7b07674313bae65ef2496801a7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 19 Feb 2009 14:45:47 -0700 Subject: ACPI: update Kconfig help texts (no functional changes) Use "help" (not "---help---") consistently throughout. ACPI can't be a module, so if both ACPI & APM are configured, we use ACPI. Update pointers to ACPI CA and Linux ACPI projects. Replace "Compaq" with "Hewlett-Packard" in the spec developer list. Fix typo in /sys/module path. The user-space daemon is "acpid", not "acpi". Add standard "To compile this driver as a module ..." help text. Signed-off-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/acpi/Kconfig | 150 +++++++++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 64 deletions(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 8a851d0f4384..431f8b439553 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -11,9 +11,9 @@ menuconfig ACPI select PNP select CPU_IDLE default y - ---help--- + help Advanced Configuration and Power Interface (ACPI) support for - Linux requires an ACPI compliant platform (hardware/firmware), + Linux requires an ACPI-compliant platform (hardware/firmware), and assumes the presence of OS-directed configuration and power management (OSPM) software. This option will enlarge your kernel by about 70K. @@ -23,20 +23,19 @@ menuconfig ACPI the Plug-and-Play BIOS specification (PnP BIOS), the MultiProcessor Specification (MPS), and the Advanced Power Management (APM) specification. If both ACPI and APM support - are configured, whichever is loaded first shall be used. + are configured, ACPI is used. - The ACPI SourceForge project contains the latest source code, - documentation, tools, mailing list subscription, and other - information. This project is available at: - + The project home page for the Linux ACPI subsystem is here: + Linux support for ACPI is based on Intel Corporation's ACPI - Component Architecture (ACPI CA). For more information see: - + Component Architecture (ACPI CA). For more information on the + ACPI CA, see: + - ACPI is an open industry specification co-developed by Compaq, - Intel, Microsoft, Phoenix, and Toshiba. The specification is - available at: + ACPI is an open industry specification co-developed by + Hewlett-Packard, Intel, Microsoft, Phoenix, and Toshiba. + The specification is available at: if ACPI @@ -49,14 +48,14 @@ config ACPI_SLEEP config ACPI_PROCFS bool "Deprecated /proc/acpi files" depends on PROC_FS - ---help--- + help For backwards compatibility, this option allows deprecated /proc/acpi/ files to exist, even when they have been replaced by functions in /sys. The deprecated files (and their replacements) include: /proc/acpi/sleep (/sys/power/state) - /proc/acpi/info (/sys/modules/acpi/parameters/acpica_version) + /proc/acpi/info (/sys/module/acpi/parameters/acpica_version) /proc/acpi/dsdt (/sys/firmware/acpi/tables/DSDT) /proc/acpi/fadt (/sys/firmware/acpi/tables/FACP) /proc/acpi/debug_layer (/sys/module/acpi/parameters/debug_layer) @@ -66,11 +65,12 @@ config ACPI_PROCFS and functions which do not yet exist in /sys. Say N to delete /proc/acpi/ files that have moved to /sys/ + config ACPI_PROCFS_POWER bool "Deprecated power /proc/acpi directories" depends on PROC_FS default y - ---help--- + help For backwards compatibility, this option allows deprecated power /proc/acpi/ directories to exist, even when they have been replaced by functions in /sys. @@ -86,19 +86,19 @@ config ACPI_SYSFS_POWER bool "Future power /sys interface" select POWER_SUPPLY default y - ---help--- + help Say N to disable power /sys interface config ACPI_PROC_EVENT bool "Deprecated /proc/acpi/event support" depends on PROC_FS default y - ---help--- - A user-space daemon, acpi, typically read /proc/acpi/event - and handled all ACPI sub-system generated events. + help + A user-space daemon, acpid, typically reads /proc/acpi/event + and handles all ACPI-generated events. - These events are now delivered to user-space via - either the input layer, or as netlink events. + These events are now delivered to user-space either + via the input layer or as netlink events. This build option enables the old code for legacy user-space implementation. After some time, this will @@ -112,10 +112,13 @@ config ACPI_AC depends on X86 default y help - This driver adds support for the AC Adapter object, which indicates - whether a system is on AC, or not. If you have a system that can + This driver supports the AC Adapter object, which indicates + whether a system is on AC or not. If you have a system that can switch between A/C and battery, say Y. + To compile this driver as a module, choose M here: + the module will be called ac. + config ACPI_BATTERY tristate "Battery" depends on X86 @@ -125,15 +128,21 @@ config ACPI_BATTERY /proc/acpi/battery. If you have a mobile system with a battery, say Y. + To compile this driver as a module, choose M here: + the module will be called battery. + config ACPI_BUTTON tristate "Button" depends on INPUT default y help - This driver handles events on the power, sleep and lid buttons. + This driver handles events on the power, sleep, and lid buttons. A daemon reads /proc/acpi/event and perform user-defined actions such as shutting down the system. This is necessary for - software controlled poweroff. + software-controlled poweroff. + + To compile this driver as a module, choose M here: + the module will be called button. config ACPI_VIDEO tristate "Video" @@ -141,38 +150,45 @@ config ACPI_VIDEO depends on INPUT select THERMAL help - This driver implement the ACPI Extensions For Display Adapters + This driver implements the ACPI Extensions For Display Adapters for integrated graphics devices on motherboard, as specified in - ACPI 2.0 Specification, Appendix B, allowing to perform some basic - control like defining the video POST device, retrieving EDID information - or to setup a video output, etc. - Note that this is an ref. implementation only. It may or may not work - for your integrated video device. + ACPI 2.0 Specification, Appendix B. This supports basic operations + such as defining the video POST device, retrieving EDID information, + and setting up a video output. + + To compile this driver as a module, choose M here: + the module will be called video. config ACPI_FAN tristate "Fan" select THERMAL default y help - This driver adds support for ACPI fan devices, allowing user-mode + This driver supports ACPI fan devices, allowing user-mode applications to perform basic fan control (on, off, status). + To compile this driver as a module, choose M here: + the module will be called fan. + config ACPI_DOCK bool "Dock" depends on EXPERIMENTAL help - This driver adds support for ACPI controlled docking stations and removable - drive bays such as the IBM ultrabay or the Dell Module Bay. + This driver supports ACPI-controlled docking stations and removable + drive bays such as the IBM Ultrabay and the Dell Module Bay. config ACPI_PROCESSOR tristate "Processor" select THERMAL default y help - This driver installs ACPI as the idle handler for Linux, and uses - ACPI C2 and C3 processor states to save power, on systems that + This driver installs ACPI as the idle handler for Linux and uses + ACPI C2 and C3 processor states to save power on systems that support it. It is required by several flavors of cpufreq - Performance-state drivers. + performance-state drivers. + + To compile this driver as a module, choose M here: + the module will be called processor. config ACPI_HOTPLUG_CPU bool @@ -186,11 +202,14 @@ config ACPI_THERMAL select THERMAL default y help - This driver adds support for ACPI thermal zones. Most mobile and + This driver supports ACPI thermal zones. Most mobile and some desktop systems support ACPI thermal zones. It is HIGHLY recommended that this option be enabled, as your processor(s) may be damaged without it. + To compile this driver as a module, choose M here: + the module will be called thermal. + config ACPI_NUMA bool "NUMA support" depends on NUMA @@ -218,7 +237,7 @@ config ACPI_BLACKLIST_YEAR int "Disable ACPI for systems before Jan 1st this year" if X86_32 default 0 help - enter a 4-digit year, eg. 2001 to disable ACPI by default + Enter a 4-digit year, e.g., 2001, to disable ACPI by default on platforms with DMI BIOS date before January 1st that year. "acpi=force" can be used to override this mechanism. @@ -249,10 +268,13 @@ config ACPI_PCI_SLOT tristate "PCI slot detection driver" default n help - This driver will attempt to discover all PCI slots in your system, - and creates entries in /sys/bus/pci/slots/. This feature can - help you correlate PCI bus addresses with the physical geography - of your slots. If you are unsure, say N. + This driver creates entries in /sys/bus/pci/slots/ for all PCI + slots in the system. This can help correlate PCI bus addresses, + i.e., segment/bus/device/function tuples, with physical slots in + the system. If you are unsure, say N. + + To compile this driver as a module, choose M here: + the module will be called pci_slot. config X86_PM_TIMER bool "Power Management Timer Support" if EMBEDDED @@ -271,43 +293,43 @@ config X86_PM_TIMER systems require this timer. config ACPI_CONTAINER - tristate "ACPI0004,PNP0A05 and PNP0A06 Container Driver (EXPERIMENTAL)" + tristate "Container and Module Devices (EXPERIMENTAL)" depends on EXPERIMENTAL default (ACPI_HOTPLUG_MEMORY || ACPI_HOTPLUG_CPU || ACPI_HOTPLUG_IO) - ---help--- - This allows _physical_ insertion and removal of CPUs and memory. - This can be useful, for example, on NUMA machines that support - ACPI based physical hotplug of nodes, or non-NUMA machines that - support physical cpu/memory hot-plug. + help + This driver supports ACPI Container and Module devices (IDs + ACPI0004, PNP0A05, and PNP0A06). - If one selects "m", this driver can be loaded with - "modprobe container". + This helps support hotplug of nodes, CPUs, and memory. + + To compile this driver as a module, choose M here: + the module will be called container. config ACPI_HOTPLUG_MEMORY tristate "Memory Hotplug" depends on MEMORY_HOTPLUG default n help - This driver adds supports for ACPI Memory Hotplug. This driver - provides support for fielding notifications on ACPI memory - devices (PNP0C80) which represent memory ranges that may be - onlined or offlined during runtime. + This driver supports ACPI memory hotplug. The driver + fields notifications on ACPI memory devices (PNP0C80), + which represent memory ranges that may be onlined or + offlined during runtime. - Enabling this driver assumes that your platform hardware - and firmware have support for hot-plugging physical memory. If - your system does not support physically adding or ripping out - memory DIMMs at some platform defined granularity (individually - or as a bank) at runtime, then you need not enable this driver. + If your hardware and firmware do not support adding or + removing memory devices at runtime, you need not enable + this driver. - If one selects "m," this driver can be loaded using the following - command: - $>modprobe acpi_memhotplug + To compile this driver as a module, choose M here: + the module will be called acpi_memhotplug. config ACPI_SBS tristate "Smart Battery System" depends on X86 help - This driver adds support for the Smart Battery System, another + This driver supports the Smart Battery System, another type of access to battery information, found on some laptops. + To compile this driver as a module, choose M here: + the modules will be called sbs and sbshc. + endif # ACPI -- cgit v1.2.3-59-g8ed1b From e60cc7a6f02598fc23c68a656fe9c263d6531ca0 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 13 Mar 2009 12:08:26 -0600 Subject: ACPI: move private declarations to internal.h A number of things that shouldn't be exposed outside the ACPI core were declared in include/acpi/acpi_drivers.h, where anybody can see them. This patch moves those declarations to a new "internal.h" inside drivers/acpi. Signed-off-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/acpi/bus.c | 2 ++ drivers/acpi/internal.h | 24 ++++++++++++++++++++++++ drivers/acpi/scan.c | 2 ++ drivers/acpi/sleep.c | 2 ++ drivers/acpi/wakeup.c | 2 ++ include/acpi/acpi_drivers.h | 23 ----------------------- 6 files changed, 32 insertions(+), 23 deletions(-) create mode 100644 drivers/acpi/internal.h diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 765fd1c56cd6..2e90410a3035 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -39,6 +39,8 @@ #include #include +#include "internal.h" + #define _COMPONENT ACPI_BUS_COMPONENT ACPI_MODULE_NAME("bus"); diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h new file mode 100644 index 000000000000..4aee4a236fc9 --- /dev/null +++ b/drivers/acpi/internal.h @@ -0,0 +1,24 @@ +/* For use by Linux/ACPI infrastructure, not drivers */ + +/* -------------------------------------------------------------------------- + Power Resource + -------------------------------------------------------------------------- */ + +int acpi_device_sleep_wake(struct acpi_device *dev, + int enable, int sleep_state, int dev_state); +int acpi_enable_wakeup_device_power(struct acpi_device *dev, int sleep_state); +int acpi_disable_wakeup_device_power(struct acpi_device *dev); +int acpi_power_get_inferred_state(struct acpi_device *device); +int acpi_power_transition(struct acpi_device *device, int state); +extern int acpi_power_nocheck; + +/* -------------------------------------------------------------------------- + Embedded Controller + -------------------------------------------------------------------------- */ +int acpi_ec_ecdt_probe(void); +int acpi_boot_ec_enable(void); + +/*-------------------------------------------------------------------------- + Suspend/Resume + -------------------------------------------------------------------------- */ +extern int acpi_sleep_init(void); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index c54d7b6c4066..2f04cd1147e9 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -11,6 +11,8 @@ #include +#include "internal.h" + #define _COMPONENT ACPI_BUS_COMPONENT ACPI_MODULE_NAME("scan"); #define STRUCT_TO_INT(s) (*((int*)&s)) diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 519266654f06..0f86cf74216b 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -21,6 +21,8 @@ #include #include + +#include "internal.h" #include "sleep.h" u8 sleep_states[ACPI_S_STATE_COUNT]; diff --git a/drivers/acpi/wakeup.c b/drivers/acpi/wakeup.c index 2d34806d45dd..3f29fd53e9a6 100644 --- a/drivers/acpi/wakeup.c +++ b/drivers/acpi/wakeup.c @@ -8,6 +8,8 @@ #include #include #include + +#include "internal.h" #include "sleep.h" #define _COMPONENT ACPI_SYSTEM_COMPONENT diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 5fc1bb0f4a90..241d227de6c0 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -98,24 +98,6 @@ int acpi_pci_bind_root(struct acpi_device *device, struct acpi_pci_id *id, struct pci_bus *pci_acpi_scan_root(struct acpi_device *device, int domain, int bus); -/* -------------------------------------------------------------------------- - Power Resource - -------------------------------------------------------------------------- */ - -int acpi_device_sleep_wake(struct acpi_device *dev, - int enable, int sleep_state, int dev_state); -int acpi_enable_wakeup_device_power(struct acpi_device *dev, int sleep_state); -int acpi_disable_wakeup_device_power(struct acpi_device *dev); -int acpi_power_get_inferred_state(struct acpi_device *device); -int acpi_power_transition(struct acpi_device *device, int state); -extern int acpi_power_nocheck; - -/* -------------------------------------------------------------------------- - Embedded Controller - -------------------------------------------------------------------------- */ -int acpi_ec_ecdt_probe(void); -int acpi_boot_ec_enable(void); - /* -------------------------------------------------------------------------- Processor -------------------------------------------------------------------------- */ @@ -165,9 +147,4 @@ static inline void unregister_hotplug_dock_device(acpi_handle handle) } #endif -/*-------------------------------------------------------------------------- - Suspend/Resume - -------------------------------------------------------------------------- */ -extern int acpi_sleep_init(void); - #endif /*__ACPI_DRIVERS_H__*/ -- cgit v1.2.3-59-g8ed1b From c269fc8c537d761f36cb98e637ae934d9331a9d5 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 17 Mar 2009 01:20:59 -0500 Subject: tracing: fix leak in event_format_read() Impact: fix memory leak If event_format_read() exits early due to nonzero ppos, the previous kmalloc doesn't get freed - might as well do the check before the kmalloc and avoid the problem. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237270859.8033.141.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 238ea95a4115..c88227b3b9db 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -378,15 +378,15 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, char *buf; int r; + if (*ppos) + return 0; + s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) return -ENOMEM; trace_seq_init(s); - if (*ppos) - return 0; - /* If any of the first writes fail, so will the show_format. */ trace_seq_printf(s, "name: %s\n", call->name); -- cgit v1.2.3-59-g8ed1b From 1be1cb7b47f0744141ed61cdb25648819ae1a56f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Mar 2009 18:53:18 +0100 Subject: debugobjects: replace static objects when slab cache becomes available Impact: refactor/consolidate object management, prepare for delayed free debugobjects allocates static reference objects to track objects which are initialized or activated before the slab cache becomes available. These static reference objects have to be handled seperately in free_object(). The handling of these objects is in the way of implementing a delayed free functionality. The delayed free is required to avoid callbacks into the mm code from debug_check_no_obj_freed(). Replace the static object references with dynamic ones after the slab cache has been initialized. The static objects are now marked initdata. Signed-off-by: Thomas Gleixner LKML-Reference: <200903162049.58058.nickpiggin@yahoo.com.au> --- lib/debugobjects.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 90e46fa12721..fdcda3dbcd35 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -30,7 +30,7 @@ struct debug_bucket { static struct debug_bucket obj_hash[ODEBUG_HASH_SIZE]; -static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE]; +static struct debug_obj obj_static_pool[ODEBUG_POOL_SIZE] __initdata; static DEFINE_SPINLOCK(pool_lock); @@ -883,6 +883,63 @@ void __init debug_objects_early_init(void) hlist_add_head(&obj_static_pool[i].node, &obj_pool); } +/* + * Convert the statically allocated objects to dynamic ones: + */ +static int debug_objects_replace_static_objects(void) +{ + struct debug_bucket *db = obj_hash; + struct hlist_node *node, *tmp; + struct debug_obj *obj, *new; + HLIST_HEAD(objects); + int i, cnt = 0; + + for (i = 0; i < ODEBUG_POOL_SIZE; i++) { + obj = kmem_cache_zalloc(obj_cache, GFP_KERNEL); + if (!obj) + goto free; + hlist_add_head(&obj->node, &objects); + } + + /* + * When debug_objects_mem_init() is called we know that only + * one CPU is up, so disabling interrupts is enough + * protection. This avoids the lockdep hell of lock ordering. + */ + local_irq_disable(); + + /* Remove the statically allocated objects from the pool */ + hlist_for_each_entry_safe(obj, node, tmp, &obj_pool, node) + hlist_del(&obj->node); + /* Move the allocated objects to the pool */ + hlist_move_list(&objects, &obj_pool); + + /* Replace the active object references */ + for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { + hlist_move_list(&db->list, &objects); + + hlist_for_each_entry(obj, node, &objects, node) { + new = hlist_entry(obj_pool.first, typeof(*obj), node); + hlist_del(&new->node); + /* copy object data */ + *new = *obj; + hlist_add_head(&new->node, &db->list); + cnt++; + } + } + + printk(KERN_DEBUG "ODEBUG: %d of %d active objects replaced\n", cnt, + obj_pool_used); + local_irq_enable(); + return 0; +free: + hlist_for_each_entry_safe(obj, node, tmp, &objects, node) { + hlist_del(&obj->node); + kmem_cache_free(obj_cache, obj); + } + return -ENOMEM; +} + /* * Called after the kmem_caches are functional to setup a dedicated * cache pool, which has the SLAB_DEBUG_OBJECTS flag set. This flag @@ -898,8 +955,11 @@ void __init debug_objects_mem_init(void) sizeof (struct debug_obj), 0, SLAB_DEBUG_OBJECTS, NULL); - if (!obj_cache) + if (!obj_cache || debug_objects_replace_static_objects()) { debug_objects_enabled = 0; - else + if (obj_cache) + kmem_cache_destroy(obj_cache); + printk(KERN_WARNING "ODEBUG: out of memory.\n"); + } else debug_objects_selftest(); } -- cgit v1.2.3-59-g8ed1b From 337fff8b5ed0573ea106491c6de47bd7fe623500 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Mar 2009 10:04:53 +0100 Subject: debugobjects: delay free of internal objects Impact: avoid recursive kfree calls, less slab activity on heavy load debugobjects checks on kfree whether tracked objects are freed. When a tracked object is freed debugobjects frees the internal reference object as well. The debug object slab cache is marked to not recurse into debugobjects when a slab objects is freed, but the recursive call can be problematic versus locking in the memory allocator. Defer the freeing of debug slab objects via schedule_work. The reasons not to use RCU are: 1) rcu makes the data structure larger 2) there is no real need for rcu as nothing references the obj after we freed it 3) under heavy load it is easier to reuse the to be freed objects instead of allocating new objects from the slab. This lowered the slab activity significantly in a heavy load networking test where lots of timers are created/destroyed. The workqueue based delayed free allows us just to put the to be freed objects back into the object pool and reuse them right away. Signed-off-by: Thomas Gleixner LKML-Reference: <200903162049.58058.nickpiggin@yahoo.com.au> --- lib/debugobjects.c | 53 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index fdcda3dbcd35..2755a3bd16a1 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -50,6 +50,9 @@ static int debug_objects_enabled __read_mostly static struct debug_obj_descr *descr_test __read_mostly; +static void free_obj_work(struct work_struct *work); +static DECLARE_WORK(debug_obj_work, free_obj_work); + static int __init enable_object_debug(char *str) { debug_objects_enabled = 1; @@ -154,25 +157,51 @@ alloc_object(void *addr, struct debug_bucket *b, struct debug_obj_descr *descr) } /* - * Put the object back into the pool or give it back to kmem_cache: + * workqueue function to free objects. */ -static void free_object(struct debug_obj *obj) +static void free_obj_work(struct work_struct *work) { - unsigned long idx = (unsigned long)(obj - obj_static_pool); + struct debug_obj *obj; unsigned long flags; - if (obj_pool_free < ODEBUG_POOL_SIZE || idx < ODEBUG_POOL_SIZE) { - spin_lock_irqsave(&pool_lock, flags); - hlist_add_head(&obj->node, &obj_pool); - obj_pool_free++; - obj_pool_used--; - spin_unlock_irqrestore(&pool_lock, flags); - } else { - spin_lock_irqsave(&pool_lock, flags); - obj_pool_used--; + spin_lock_irqsave(&pool_lock, flags); + while (obj_pool_free > ODEBUG_POOL_SIZE) { + obj = hlist_entry(obj_pool.first, typeof(*obj), node); + hlist_del(&obj->node); + obj_pool_free--; + /* + * We release pool_lock across kmem_cache_free() to + * avoid contention on pool_lock. + */ spin_unlock_irqrestore(&pool_lock, flags); kmem_cache_free(obj_cache, obj); + spin_lock_irqsave(&pool_lock, flags); } + spin_unlock_irqrestore(&pool_lock, flags); +} + +/* + * Put the object back into the pool and schedule work to free objects + * if necessary. + */ +static void free_object(struct debug_obj *obj) +{ + unsigned long flags; + int sched = 0; + + spin_lock_irqsave(&pool_lock, flags); + /* + * schedule work when the pool is filled and the cache is + * initialized: + */ + if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache) + sched = !work_pending(&debug_obj_work); + hlist_add_head(&obj->node, &obj_pool); + obj_pool_free++; + obj_pool_used--; + spin_unlock_irqrestore(&pool_lock, flags); + if (sched) + schedule_work(&debug_obj_work); } /* -- cgit v1.2.3-59-g8ed1b From f2d28a2ebcb525a6ec7e2152106ddb385ef52b73 Mon Sep 17 00:00:00 2001 From: Guillaume Knispel Date: Tue, 17 Mar 2009 16:18:42 +0100 Subject: printk: correct the behavior of printk_timed_ratelimit() Impact: fix jiffies-comparison sign-wrap behavior The behavior provided by printk_timed_ratelimit() is, in some situations, probably not what a caller would reasonably expect: bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); return true; } return false; } On a 32 bit computer, if printk_timed_ratelimit() is initially called at time jiffies == Ja, *caller_jiffies is set to Ja + msecs_to_jiffies(interval_msecs): let's say Ja + 42 for this example. If this caller then doesn't call printk_timed_ratelimit() until jiffies == Ja + (1 << 31) + 42 (which can happen as soon as ~ 25 days later on a 1000 HZ system), printk_timed_ratelimit() will then always return false to this caller until jiffies loops completely (1 << 31 more ticks). Ths change makes it only return false if jiffies is in the small time window starting at the previous call when true was returned and ending interval_msecs later. Note that if jiffies loops completely between two calls to printk_timed_ratelimit(), it will obviously still wrongly return false, but this is something with a low probability. If something completely reliable is needed I guess jiffies_64 must be used (which this change does not do). Signed-off-by: Guillaume Knispel Cc: Ulrich Drepper Cc: Rusty Russell Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: <20090317161842.0059096b@xilun.lan.proformatique.com> Signed-off-by: Ingo Molnar --- kernel/printk.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/printk.c b/kernel/printk.c index e3602d0755b0..2be719908d1f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1292,8 +1292,11 @@ EXPORT_SYMBOL(printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { - *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); + if (*caller_jiffies == 0 + || !time_in_range(jiffies, *caller_jiffies, + *caller_jiffies + + msecs_to_jiffies(interval_msecs))) { + *caller_jiffies = jiffies; return true; } return false; -- cgit v1.2.3-59-g8ed1b From 37886f6a9f62d22530ffee8d3f9215c8345b6969 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2009 17:22:06 -0400 Subject: ring-buffer: add api to allow a tracer to change clock source This patch adds a new function called ring_buffer_set_clock that allows a tracer to assign its own clock source to the buffer. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 7 +++-- kernel/trace/ring_buffer.c | 65 ++++++++++++++++++++++++++------------------- kernel/trace/trace.c | 21 ++++++++++----- 3 files changed, 57 insertions(+), 36 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b1a0068a5557..9e6052bd1a1c 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -118,8 +118,11 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); -u64 ring_buffer_time_stamp(int cpu); -void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts); +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)); size_t ring_buffer_page_len(void *page); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 58128ad2fde0..bbf51922a8ca 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -180,29 +180,6 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #include "trace.h" -/* Up this if you want to test the TIME_EXTENTS and normalization */ -#define DEBUG_SHIFT 0 - -u64 ring_buffer_time_stamp(int cpu) -{ - u64 time; - - preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = trace_clock_local() << DEBUG_SHIFT; - preempt_enable_no_resched_notrace(); - - return time; -} -EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); - -void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) -{ - /* Just stupid testing the normalize function and deltas */ - *ts >>= DEBUG_SHIFT; -} -EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA 28 @@ -374,6 +351,7 @@ struct ring_buffer { #ifdef CONFIG_HOTPLUG_CPU struct notifier_block cpu_notify; #endif + u64 (*clock)(void); }; struct ring_buffer_iter { @@ -394,6 +372,30 @@ struct ring_buffer_iter { _____ret; \ }) +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) +{ + u64 time; + + preempt_disable_notrace(); + /* shift to debug/test normalization and TIME_EXTENTS */ + time = buffer->clock() << DEBUG_SHIFT; + preempt_enable_no_resched_notrace(); + + return time; +} +EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); + +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} +EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); + /** * check_pages - integrity check of buffer pages * @cpu_buffer: CPU buffer with pages to test @@ -569,6 +571,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; + buffer->clock = trace_clock_local; /* need at least two pages */ if (buffer->pages == 1) @@ -645,6 +648,12 @@ ring_buffer_free(struct ring_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_free); +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)) +{ + buffer->clock = clock; +} + static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); static void @@ -1191,7 +1200,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page = next_page; /* reread the time stamp */ - *ts = ring_buffer_time_stamp(cpu_buffer->cpu); + *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); cpu_buffer->tail_page->page->time_stamp = *ts; } @@ -1334,7 +1343,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - ts = ring_buffer_time_stamp(cpu_buffer->cpu); + ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. @@ -2051,7 +2060,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = cpu_buffer->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; @@ -2112,7 +2122,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_DATA: if (ts) { *ts = iter->read_stamp + event->time_delta; - ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); } return event; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f89690230e6..3be2f788e10d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -155,13 +155,6 @@ ns2usecs(cycle_t nsec) return nsec; } -cycle_t ftrace_now(int cpu) -{ - u64 ts = ring_buffer_time_stamp(cpu); - ring_buffer_normalize_time_stamp(cpu, &ts); - return ts; -} - /* * The global_trace is the descriptor that holds the tracing * buffers for the live tracing. For each CPU, it contains @@ -178,6 +171,20 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +cycle_t ftrace_now(int cpu) +{ + u64 ts; + + /* Early boot up does not have a buffer yet */ + if (!global_trace.buffer) + return trace_clock_local(); + + ts = ring_buffer_time_stamp(global_trace.buffer, cpu); + ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + + return ts; +} + /* * The max_tr is used to snapshot the global_trace when a maximum * latency is reached. Some tracers will use this to store a maximum -- cgit v1.2.3-59-g8ed1b From af4617bdba34aa556272b34c3986b0a4d588f568 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2009 18:09:55 -0400 Subject: tracing: add global-clock option to provide cross CPU clock to traces Impact: feature to allow better serialized clock This patch adds an option called "global-clock" that will allow the tracer to switch to a slower but more accurate (across CPUs) clock. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 34 ++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 1 + 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3be2f788e10d..2f994caab0b7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -315,6 +315,7 @@ static const char *trace_options[] = { "printk-msg-only", "context-info", "latency-format", + "global-clock", NULL }; @@ -2251,6 +2252,34 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return 0; } +static void set_tracer_flags(unsigned int mask, int enabled) +{ + /* do nothing if flag is already set */ + if (!!(trace_flags & mask) == !!enabled) + return; + + if (enabled) + trace_flags |= mask; + else + trace_flags &= ~mask; + + if (mask == TRACE_ITER_GLOBAL_CLK) { + u64 (*func)(void); + + if (enabled) + func = trace_clock_global; + else + func = trace_clock_local; + + mutex_lock(&trace_types_lock); + ring_buffer_set_clock(global_trace.buffer, func); + + if (max_tr.buffer) + ring_buffer_set_clock(max_tr.buffer, func); + mutex_unlock(&trace_types_lock); + } +} + static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -2278,10 +2307,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, int len = strlen(trace_options[i]); if (strncmp(cmp, trace_options[i], len) == 0) { - if (neg) - trace_flags &= ~(1 << i); - else - trace_flags |= (1 << i); + set_tracer_flags(1 << i, !neg); break; } } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b0ecad8ecc34..26a7a28ca110 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -667,6 +667,7 @@ enum trace_iterator_flags { TRACE_ITER_PRINTK_MSGONLY = 0x10000, TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ TRACE_ITER_LATENCY_FMT = 0x40000, + TRACE_ITER_GLOBAL_CLK = 0x80000, }; /* -- cgit v1.2.3-59-g8ed1b From 97e7e4f391cac2b00417b581b432533d245d4fd0 Mon Sep 17 00:00:00 2001 From: Witold Baryluk Date: Tue, 17 Mar 2009 21:15:44 +0100 Subject: tracing: optimization of branch tracer Impact: better performance for if branch tracer Use an array to count the hit and misses of a conditional instead of using another conditional. This cuts down on saturation of branch predictions and increases performance of modern pipelined architectures. Signed-off-by: Witold Baryluk Signed-off-by: Steven Rostedt --- include/linux/compiler.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d95da1020f1c..6faa7e549de4 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -68,6 +68,7 @@ struct ftrace_branch_data { unsigned long miss; unsigned long hit; }; + unsigned long miss_hit[2]; }; }; @@ -125,10 +126,7 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); .line = __LINE__, \ }; \ ______r = !!(cond); \ - if (______r) \ - ______f.hit++; \ - else \ - ______f.miss++; \ + ______f.miss_hit[______r]++; \ ______r; \ })) #endif /* CONFIG_PROFILE_ALL_BRANCHES */ -- cgit v1.2.3-59-g8ed1b From 5fec6ddcb43a91aa9a254c8ecf174c803de6f07e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2009 19:59:53 -0400 Subject: tracing: make sched_switch stop/start light weight The stopping and starting of a tracer should be light weight and be able to be called in all contexts. The sched_switch grabbed mutexes in the start/stop functions. This patch changes it to a simple variable, on/off. Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_switch.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 77132c2cf3d9..de35f200abd3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -18,6 +18,7 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); +static int sched_stopped; static void probe_sched_switch(struct rq *__rq, struct task_struct *prev, @@ -28,7 +29,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!sched_ref) + if (!sched_ref || sched_stopped) return; tracing_record_cmdline(prev); @@ -193,6 +194,7 @@ static void stop_sched_trace(struct trace_array *tr) static int sched_switch_trace_init(struct trace_array *tr) { ctx_trace = tr; + tracing_reset_online_cpus(tr); tracing_start_sched_switch_record(); return 0; } @@ -205,13 +207,12 @@ static void sched_switch_trace_reset(struct trace_array *tr) static void sched_switch_trace_start(struct trace_array *tr) { - tracing_reset_online_cpus(tr); - tracing_start_sched_switch(); + sched_stopped = 0; } static void sched_switch_trace_stop(struct trace_array *tr) { - tracing_stop_sched_switch(); + sched_stopped = 1; } static struct tracer sched_switch_trace __read_mostly = -- cgit v1.2.3-59-g8ed1b From 62524d55e5b9ffe36e3bf3dd7a594114f150b449 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2009 20:58:00 -0400 Subject: tracing: make power tracer start/stop methods lighter weight The start/stop methods of a tracer should be able to be executed in all contexts. This patch converts the power tracer to do so. Signed-off-by: Steven Rostedt --- kernel/trace/trace_power.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 91ce672fb037..bae791ebcc51 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -122,10 +122,14 @@ fail_start: static void start_power_trace(struct trace_array *tr) { trace_power_enabled = 1; - tracing_power_register(); } static void stop_power_trace(struct trace_array *tr) +{ + trace_power_enabled = 0; +} + +static void power_trace_reset(struct trace_array *tr) { trace_power_enabled = 0; unregister_trace_power_start(probe_power_start); @@ -188,7 +192,7 @@ static struct tracer power_tracer __read_mostly = .init = power_trace_init, .start = start_power_trace, .stop = stop_power_trace, - .reset = stop_power_trace, + .reset = power_trace_reset, .print_line = power_print_line, }; -- cgit v1.2.3-59-g8ed1b From 30e1e6d1af2b67558bccf322af2b3e0676b209ae Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 17 Mar 2009 14:50:34 +1030 Subject: cpumask: fix CONFIG_CPUMASK_OFFSTACK=y cpu hotunplug crash Impact: Fix cpu offline when CONFIG_MAXSMP=y Changeset bc9b83dd1f66402b870301c3c7117b9c1484abb4 "cpumask: convert c1e_mask in arch/x86/kernel/process.c to cpumask_var_t" contained a bug: c1e_mask is manipulated even if C1E isn't detected (and hence not allocated). This is simply fixed by checking for NULL (which gcc optimizes out anyway of CONFIG_CPUMASK_OFFSTACK=n, since it knows ce1_mask can never be NULL). In addition, fix a leak where select_idle_routine re-allocates (and re-clears) c1e_mask on every cpu init. Reported-by: Ingo Molnar Signed-off-by: Rusty Russell Cc: Mike Travis LKML-Reference: <200903171450.34549.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/common.c | 1 + arch/x86/kernel/process.c | 14 +++++++++++--- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d794d9483c56..9874dd98a29f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -733,6 +733,7 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx) extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); extern void select_idle_routine(const struct cpuinfo_x86 *c); +extern void init_c1e_mask(void); extern unsigned long boot_option_idle_override; extern unsigned long idle_halt; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 82f6cc045ad0..d7dd3c294e2a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -812,6 +812,7 @@ static void vgetcpu_set_mode(void) void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); + init_c1e_mask(); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6638294cec8d..78533a519d8f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -479,7 +479,8 @@ static int c1e_detected; void c1e_remove_cpu(int cpu) { - cpumask_clear_cpu(cpu, c1e_mask); + if (c1e_mask != NULL) + cpumask_clear_cpu(cpu, c1e_mask); } /* @@ -556,13 +557,20 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) pm_idle = mwait_idle; } else if (check_c1e_idle(c)) { printk(KERN_INFO "using C1E aware idle routine\n"); - alloc_cpumask_var(&c1e_mask, GFP_KERNEL); - cpumask_clear(c1e_mask); pm_idle = c1e_idle; } else pm_idle = default_idle; } +void __init init_c1e_mask(void) +{ + /* If we're using c1e_idle, we need to allocate c1e_mask. */ + if (pm_idle == c1e_idle) { + alloc_cpumask_var(&c1e_mask, GFP_KERNEL); + cpumask_clear(c1e_mask); + } +} + static int __init idle_setup(char *str) { if (!str) -- cgit v1.2.3-59-g8ed1b From 18aecd362a1c991fbf5f7919ae051a77532ba2f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Mar 2009 08:56:58 +0100 Subject: tracing: stop command line recording when tracing is disabled Impact: prevent overwrite of command line entries When the tracer is stopped the command line recording continues to record. The check for tracing_is_on() is not sufficient here as the ringbuffer status is not affected by setting debug/tracing/tracing_enabled to 0. On a non idle system this can result in the loss of the command line information for the stopped trace, which makes the trace harder to read and analyse. Check tracer_enabled to allow further recording. Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ce6208fd727..7b6043ea256e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -797,7 +797,8 @@ void trace_find_cmdline(int pid, char comm[]) void tracing_record_cmdline(struct task_struct *tsk) { - if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) + if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || + !tracing_is_on()) return; trace_save_cmdline(tsk); -- cgit v1.2.3-59-g8ed1b From 2c7eea4c62ba090b7f4583c3d7337ea0019be900 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Mar 2009 09:03:19 +0100 Subject: tracing: replace the crude (unsigned) -1 hackery Impact: cleanup The command line recorder uses (unsigned) -1 to mark non mapped entries in the pid to command line maps. The validity check is completely unintuitive: idx >= SAVED_CMDLINES There is no need for such casting games. Use a constant to mark unmapped entries and check for that constant to make the code readable and understandable. Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7b6043ea256e..ca673c475687 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -633,6 +633,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) } #define SAVED_CMDLINES 128 +#define NO_CMDLINE_MAP UINT_MAX static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; @@ -644,8 +645,8 @@ static atomic_t trace_record_cmdline_disabled __read_mostly; static void trace_init_cmdlines(void) { - memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); - memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); + memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); + memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); cmdline_idx = 0; } @@ -753,12 +754,12 @@ static void trace_save_cmdline(struct task_struct *tsk) return; idx = map_pid_to_cmdline[tsk->pid]; - if (idx >= SAVED_CMDLINES) { + if (idx == NO_CMDLINE_MAP) { idx = (cmdline_idx + 1) % SAVED_CMDLINES; map = map_cmdline_to_pid[idx]; - if (map <= PID_MAX_DEFAULT) - map_pid_to_cmdline[map] = (unsigned)-1; + if (map != NO_CMDLINE_MAP) + map_pid_to_cmdline[map] = NO_CMDLINE_MAP; map_pid_to_cmdline[tsk->pid] = idx; @@ -786,7 +787,7 @@ void trace_find_cmdline(int pid, char comm[]) __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; - if (map >= SAVED_CMDLINES) + if (map == NO_CMDLINE_MAP) goto out; strcpy(comm, saved_cmdlines[map]); -- cgit v1.2.3-59-g8ed1b From 50d88758a3f9787cbdbdbc030560b815721eab4b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Mar 2009 08:58:44 +0100 Subject: tracing: fix trace_find_cmdline() Impact: prevent stale command line output In case there is no valid command line mapping for a pid trace_find_cmdline() returns without updating the comm buffer. The trace dump keeps the previous entry which results in confusing trace output: -0 [000] 280.702056 .... -23456 [000] 280.702080 .... Update the comm buffer with "<...>" when no mapping is found. Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ca673c475687..06c69a260328 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -787,12 +787,11 @@ void trace_find_cmdline(int pid, char comm[]) __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; - if (map == NO_CMDLINE_MAP) - goto out; - - strcpy(comm, saved_cmdlines[map]); + if (map != NO_CMDLINE_MAP) + strcpy(comm, saved_cmdlines[map]); + else + strcpy(comm, "<...>"); - out: __raw_spin_unlock(&trace_cmdline_lock); } -- cgit v1.2.3-59-g8ed1b From a635cf0497342978d417cae19d4a4823932977ff Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Wed, 18 Mar 2009 09:00:41 +0100 Subject: tracing: fix command line to pid reverse map Impact: fix command line to pid mapping map_cmdline_to_pid[] is checked in trace_save_cmdline(), but never updated. This results in stale pid to command line mappings and the tracer output will associate the wrong comm string. Signed-off-by: Carsten Emde Signed-off-by: Thomas Gleixner Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06c69a260328..305c562dae2a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -738,8 +738,7 @@ void trace_stop_cmdline_recording(void); static void trace_save_cmdline(struct task_struct *tsk) { - unsigned map; - unsigned idx; + unsigned pid, idx; if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) return; @@ -757,10 +756,17 @@ static void trace_save_cmdline(struct task_struct *tsk) if (idx == NO_CMDLINE_MAP) { idx = (cmdline_idx + 1) % SAVED_CMDLINES; - map = map_cmdline_to_pid[idx]; - if (map != NO_CMDLINE_MAP) - map_pid_to_cmdline[map] = NO_CMDLINE_MAP; + /* + * Check whether the cmdline buffer at idx has a pid + * mapped. We are going to overwrite that entry so we + * need to clear the map_pid_to_cmdline. Otherwise we + * would read the new comm for the old pid. + */ + pid = map_cmdline_to_pid[idx]; + if (pid != NO_CMDLINE_MAP) + map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + map_cmdline_to_pid[idx] = tsk->pid; map_pid_to_cmdline[tsk->pid] = idx; cmdline_idx = idx; -- cgit v1.2.3-59-g8ed1b From 490362003457f8d387f6f6e73e3a7efbf56c3314 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Mar 2009 22:38:58 +0100 Subject: tracing/ftrace: stop {irqs, preempt}soff tracers when tracing is stopped Impact: fix a selftest warning In some cases, it's possible to see the following warning on irqsoff tracer selftest: [ 4.640003] Testing tracer irqsoff: <4>------------[ cut here ]------------ [ 4.653562] WARNING: at kernel/trace/trace.c:458 update_max_tr_single+0x9a/0xc4() [ 4.660000] Hardware name: System Product Name [ 4.660000] Modules linked in: [ 4.660000] Pid: 301, comm: kstop/1 Not tainted 2.6.29-rc8-tip #35837 [ 4.660000] Call Trace: [ 4.660000] [<4014b588>] warn_slowpath+0x79/0x8f [ 4.660000] [<402d6949>] ? put_dec+0x64/0x6b [ 4.660000] [<40162b56>] ? getnstimeofday+0x58/0xdd [ 4.660000] [<40162210>] ? clocksource_read+0x3/0xf [ 4.660000] [<4015eb44>] ? ktime_set+0x8/0x34 [ 4.660000] [<4014101a>] ? balance_runtime+0x8/0x56 [ 4.660000] [<405f6f11>] ? _spin_lock+0x3/0x10 [ 4.660000] [<4011f643>] ? ftrace_call+0x5/0x8 [ 4.660000] [<4015d0f1>] ? task_cputime_zero+0x3/0x27 [ 4.660000] [<40190ee7>] ? cpupri_set+0x90/0xcb [ 4.660000] [<405f7208>] ? _spin_lock_irqsave+0x22/0x34 [ 4.660000] [<40190f12>] ? cpupri_set+0xbb/0xcb [ 4.660000] [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35 [ 4.660000] [<4018493f>] ? ring_buffer_reset_cpu+0x27/0x51 [ 4.660000] [<405f7208>] ? _spin_lock_irqsave+0x22/0x34 [ 4.660000] [<40184962>] ? ring_buffer_reset_cpu+0x4a/0x51 [ 4.660000] [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35 [ 4.660000] [<4018cc29>] ? trace_hardirqs_off+0x1a/0x1c [ 4.660000] [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35 [ 4.660000] [<40184962>] ? ring_buffer_reset_cpu+0x4a/0x51 [ 4.660000] [<401850f3>] ? cpumask_next+0x15/0x18 [ 4.660000] [<4018a41f>] update_max_tr_single+0x9a/0xc4 [ 4.660000] [<4014e5fe>] ? exit_notify+0x16/0xf2 [ 4.660000] [<4018cd13>] check_critical_timing+0xcc/0x11e [ 4.660000] [<4014e5fe>] ? exit_notify+0x16/0xf2 [ 4.660000] [<4014e5fe>] ? exit_notify+0x16/0xf2 [ 4.660000] [<4018cdf1>] stop_critical_timing+0x8c/0x9f [ 4.660000] [<4014e5c4>] ? forget_original_parent+0xac/0xd0 [ 4.660000] [<4018ce3a>] trace_hardirqs_on+0x1a/0x1c [ 4.660000] [<4014e5c4>] forget_original_parent+0xac/0xd0 [ 4.660000] [<4014e5fe>] exit_notify+0x16/0xf2 [ 4.660000] [<4014e8a5>] do_exit+0x1cb/0x225 [ 4.660000] [<4015c72b>] ? kthread+0x0/0x69 [ 4.660000] [<4011f61d>] kernel_thread_helper+0xd/0x10 [ 4.660000] ---[ end trace a7919e7f17c0a725 ]--- [ 4.660164] .. no entries found ..FAILED! During the selftest of irqsoff tracer, we do that: /* disable interrupts for a bit */ local_irq_disable(); udelay(100); local_irq_enable(); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(tr, NULL); If a callsite performs a new max delay with irqs off just after tracing_stop, update_max_tr_single() -> ring_buffer_swap_cpu() will be called with the buffers disabled by tracing_stop(), hence the warning, then ring_buffer_swap_cpu() return -EAGAIN and update_max_tr_single() complains. Fix it by also stopping the tracer before stopping the tracing globally. A similar situation can happen with preemptoff and preemptirqsoff tracers where we apply the same fix. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <1237325938-5240-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a2ca6f0fef9b..38856ba78a92 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -315,6 +315,14 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) local_irq_disable(); udelay(100); local_irq_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs off latencies. + */ + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ @@ -369,6 +377,14 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) preempt_disable(); udelay(100); preempt_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max preempt off latencies. + */ + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ @@ -428,6 +444,13 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* reverse the order of preempt vs irqs */ local_irq_enable(); + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs/preempt off latencies. + */ + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ @@ -448,6 +471,8 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* do the test by disabling interrupts first this time */ tracing_max_latency = 0; tracing_start(); + trace->start(tr); + preempt_disable(); local_irq_disable(); udelay(100); @@ -455,6 +480,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* reverse the order of preempt vs irqs */ local_irq_enable(); + trace->stop(tr); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ -- cgit v1.2.3-59-g8ed1b From f02b8624fedca39886b0eef770dca70c2f0749b3 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Wed, 18 Mar 2009 17:06:21 +0530 Subject: kprobes: Fix locking imbalance in kretprobes Fix locking imbalance in kretprobes: ===================================== [ BUG: bad unlock balance detected! ] ------------------------------------- kthreadd/2 is trying to release lock (&rp->lock) at: [] pre_handler_kretprobe+0xea/0xf4 but there are no more locks to release! other info that might help us debug this: 1 lock held by kthreadd/2: #0: (rcu_read_lock){..--}, at: [] __atomic_notifier_call_chain+0x0/0x5a stack backtrace: Pid: 2, comm: kthreadd Not tainted 2.6.29-rc8 #1 Call Trace: [] ? printk+0xf/0x17 [] ? pre_handler_kretprobe+0xea/0xf4 [] print_unlock_inbalance_bug+0xc3/0xce [] ? clocksource_read+0x7/0xa [] ? getnstimeofday+0x5f/0xf6 [] ? register_lock_class+0x17/0x293 [] ? mark_lock+0x1e/0x30b [] ? tick_dev_program_event+0x4a/0xbc [] ? __slab_alloc+0xa5/0x415 [] ? pre_handler_kretprobe+0x28/0xf4 [] ? pre_handler_kretprobe+0xea/0xf4 [] lock_release_non_nested+0xa4/0x1a5 [] ? pre_handler_kretprobe+0xea/0xf4 [] lock_release+0x141/0x166 [] _spin_unlock_irqrestore+0x19/0x50 [] pre_handler_kretprobe+0xea/0xf4 [] kprobe_exceptions_notify+0x1c9/0x43e [] notifier_call_chain+0x26/0x48 [] __atomic_notifier_call_chain+0x37/0x5a [] ? __atomic_notifier_call_chain+0x0/0x5a [] atomic_notifier_call_chain+0xc/0xe [] notify_die+0x2d/0x2f [] do_int3+0x1f/0x71 [] int3+0x2c/0x34 [] ? do_fork+0x1/0x288 [] ? kernel_thread+0x71/0x79 [] ? kthread+0x0/0x60 [] ? kthread+0x0/0x60 [] ? kernel_thread_helper+0x0/0x10 [] kthreadd+0xac/0x148 [] ? kthreadd+0x0/0x148 [] kernel_thread_helper+0x7/0x10 Signed-off-by: Ananth N Mavinakayanahalli Tested-by: Bharata B Rao Cc: Masami Hiramatsu Cc: Jim Keniston Cc: Linus Torvalds Cc: Andrew Morton Cc: [2.6.29.x, 2.6.28.x, 2.6.27.x] LKML-Reference: <20090318113621.GB4129@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 479d4d5672f9..5016bfb682b9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -919,10 +919,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, ri->rp = rp; ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) { - spin_unlock_irqrestore(&rp->lock, flags); + if (rp->entry_handler && rp->entry_handler(ri, regs)) return 0; - } arch_prepare_kretprobe(ri, regs); -- cgit v1.2.3-59-g8ed1b From af5c820a3169e81af869c113e18ec7588836cd50 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Mar 2009 16:32:36 +1030 Subject: x86: cpumask: use work_on_cpu in arch/x86/kernel/microcode_core.c Impact: don't play with current's cpumask Straightforward indirection through work_on_cpu(). One change is that the error code from microcode_update_cpu() is now actually plumbed back to microcode_init_cpu(), so now we printk if it fails on cpu hotplug. Signed-off-by: Rusty Russell Cc: Andrew Morton Cc: Dmitry Adamushko Cc: Peter Oruba LKML-Reference: <200903111632.37279.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 106 ++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 45 deletions(-) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index c9b721ba968c..9a8dbc000563 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -108,29 +108,40 @@ struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); #ifdef CONFIG_MICROCODE_OLD_INTERFACE +struct update_for_cpu { + const void __user *buf; + size_t size; +}; + +static long update_for_cpu(void *_ufc) +{ + struct update_for_cpu *ufc = _ufc; + int error; + + error = microcode_ops->request_microcode_user(smp_processor_id(), + ufc->buf, ufc->size); + if (error < 0) + return error; + if (!error) + microcode_ops->apply_microcode(smp_processor_id()); + return error; +} + static int do_microcode_update(const void __user *buf, size_t size) { - cpumask_t old; int error = 0; int cpu; - - old = current->cpus_allowed; + struct update_for_cpu ufc = { .buf = buf, .size = size }; for_each_online_cpu(cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!uci->valid) continue; - - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->request_microcode_user(cpu, buf, size); + error = work_on_cpu(cpu, update_for_cpu, &ufc); if (error < 0) - goto out; - if (!error) - microcode_ops->apply_microcode(cpu); + break; } -out: - set_cpus_allowed_ptr(current, &old); return error; } @@ -205,11 +216,26 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); /* fake device for request_firmware */ static struct platform_device *microcode_pdev; +static long reload_for_cpu(void *unused) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); + int err = 0; + + mutex_lock(µcode_mutex); + if (uci->valid) { + err = microcode_ops->request_microcode_fw(smp_processor_id(), + µcode_pdev->dev); + if (!err) + microcode_ops->apply_microcode(smp_processor_id()); + } + mutex_unlock(µcode_mutex); + return err; +} + static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t sz) { - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; char *end; unsigned long val = simple_strtoul(buf, &end, 0); int err = 0; @@ -218,21 +244,9 @@ static ssize_t reload_store(struct sys_device *dev, if (end == buf) return -EINVAL; if (val == 1) { - cpumask_t old = current->cpus_allowed; - get_online_cpus(); - if (cpu_online(cpu)) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - mutex_lock(µcode_mutex); - if (uci->valid) { - err = microcode_ops->request_microcode_fw(cpu, - µcode_pdev->dev); - if (!err) - microcode_ops->apply_microcode(cpu); - } - mutex_unlock(µcode_mutex); - set_cpus_allowed_ptr(current, &old); - } + if (cpu_online(cpu)) + err = work_on_cpu(cpu, reload_for_cpu, NULL); put_online_cpus(); } if (err) @@ -328,9 +342,9 @@ static int microcode_resume_cpu(int cpu) return 0; } -static void microcode_update_cpu(int cpu) +static long microcode_update_cpu(void *unused) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); int err = 0; /* @@ -338,30 +352,27 @@ static void microcode_update_cpu(int cpu) * otherwise just request a firmware: */ if (uci->valid) { - err = microcode_resume_cpu(cpu); + err = microcode_resume_cpu(smp_processor_id()); } else { - collect_cpu_info(cpu); + collect_cpu_info(smp_processor_id()); if (uci->valid && system_state == SYSTEM_RUNNING) - err = microcode_ops->request_microcode_fw(cpu, + err = microcode_ops->request_microcode_fw( + smp_processor_id(), µcode_pdev->dev); } if (!err) - microcode_ops->apply_microcode(cpu); + microcode_ops->apply_microcode(smp_processor_id()); + return err; } -static void microcode_init_cpu(int cpu) +static int microcode_init_cpu(int cpu) { - cpumask_t old = current->cpus_allowed; - - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu); - + int err; mutex_lock(µcode_mutex); - microcode_update_cpu(cpu); + err = work_on_cpu(cpu, microcode_update_cpu, NULL); mutex_unlock(µcode_mutex); - set_cpus_allowed_ptr(current, &old); + return err; } static int mc_sysdev_add(struct sys_device *sys_dev) @@ -379,8 +390,11 @@ static int mc_sysdev_add(struct sys_device *sys_dev) if (err) return err; - microcode_init_cpu(cpu); - return 0; + err = microcode_init_cpu(cpu); + if (err) + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + + return err; } static int mc_sysdev_remove(struct sys_device *sys_dev) @@ -404,7 +418,7 @@ static int mc_sysdev_resume(struct sys_device *dev) return 0; /* only CPU 0 will apply ucode here */ - microcode_update_cpu(0); + microcode_update_cpu(NULL); return 0; } @@ -424,7 +438,9 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - microcode_init_cpu(cpu); + if (microcode_init_cpu(cpu)) + printk(KERN_ERR "microcode: failed to init CPU%d\n", + cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("microcode: CPU%d added\n", cpu); -- cgit v1.2.3-59-g8ed1b From 4bae1967357bfc78a2fad1be5e81a4b868980ae6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 11 Mar 2009 11:19:46 +0100 Subject: x86: microcode: cleanup Impact: cleanup Cc: Rusty Russell Cc: Andrew Morton Cc: Dmitry Adamushko Cc: Peter Oruba LKML-Reference: <200903111632.37279.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 43 ++++++++++---------- arch/x86/kernel/microcode_core.c | 58 +++++++++++++-------------- arch/x86/kernel/microcode_intel.c | 83 ++++++++++++++++++++++----------------- 3 files changed, 97 insertions(+), 87 deletions(-) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c25fdb382292..453b5795a5c6 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -12,31 +12,30 @@ * * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. -*/ - + */ +#include #include -#include -#include -#include -#include -#include -#include -#include #include +#include #include -#include -#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include #include -#include -#include #include -#include -#include +#include +#include -#include -#include #include +#include +#include MODULE_DESCRIPTION("AMD Microcode Update Driver"); MODULE_AUTHOR("Peter Oruba"); @@ -72,8 +71,8 @@ struct microcode_header_amd { } __attribute__((packed)); struct microcode_amd { - struct microcode_header_amd hdr; - unsigned int mpb[0]; + struct microcode_header_amd hdr; + unsigned int mpb[0]; }; #define UCODE_MAX_SIZE 2048 @@ -184,8 +183,8 @@ static int get_ucode_data(void *to, const u8 *from, size_t n) return 0; } -static void *get_next_ucode(const u8 *buf, unsigned int size, - unsigned int *mc_size) +static void * +get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) { unsigned int total_size; u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; @@ -223,7 +222,6 @@ static void *get_next_ucode(const u8 *buf, unsigned int size, return mc; } - static int install_equiv_cpu_table(const u8 *buf) { u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; @@ -372,4 +370,3 @@ struct microcode_ops * __init init_amd_microcode(void) { return µcode_amd_ops; } - diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9a8dbc000563..a0f3851ef310 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -70,47 +70,47 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ +#include #include -#include -#include -#include +#include +#include #include +#include #include -#include -#include +#include #include -#include -#include -#include -#include +#include +#include #include +#include +#include +#include #include -#include -#include +#include +#include -#include -#include -#include #include +#include +#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); -#define MICROCODE_VERSION "2.00" +#define MICROCODE_VERSION "2.00" -static struct microcode_ops *microcode_ops; +static struct microcode_ops *microcode_ops; /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); -struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); #ifdef CONFIG_MICROCODE_OLD_INTERFACE struct update_for_cpu { - const void __user *buf; - size_t size; + const void __user *buf; + size_t size; }; static long update_for_cpu(void *_ufc) @@ -209,12 +209,12 @@ static void microcode_dev_exit(void) MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while (0) +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while (0) #endif /* fake device for request_firmware */ -static struct platform_device *microcode_pdev; +static struct platform_device *microcode_pdev; static long reload_for_cpu(void *unused) { @@ -282,8 +282,8 @@ static struct attribute *mc_default_attrs[] = { }; static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", + .attrs = mc_default_attrs, + .name = "microcode", }; static void __microcode_fini_cpu(int cpu) @@ -353,7 +353,7 @@ static long microcode_update_cpu(void *unused) */ if (uci->valid) { err = microcode_resume_cpu(smp_processor_id()); - } else { + } else { collect_cpu_info(smp_processor_id()); if (uci->valid && system_state == SYSTEM_RUNNING) err = microcode_ops->request_microcode_fw( @@ -423,9 +423,9 @@ static int mc_sysdev_resume(struct sys_device *dev) } static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, }; static __cpuinit int @@ -464,7 +464,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) } static struct notifier_block __refdata mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, + .notifier_call = mc_cpu_callback, }; static int __init microcode_init(void) diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 5e9f4fc51385..149b9ec7c1ab 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,28 +70,28 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ +#include #include -#include -#include -#include +#include +#include #include +#include #include -#include -#include +#include #include -#include -#include -#include -#include +#include +#include #include +#include +#include +#include #include -#include -#include -#include +#include +#include -#include -#include #include +#include +#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -129,12 +129,13 @@ struct extended_sigtable { struct extended_signature sigs[0]; }; -#define DEFAULT_UCODE_DATASIZE (2000) +#define DEFAULT_UCODE_DATASIZE (2000) #define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) #define DWSIZE (sizeof(u32)) + #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.totalsize ? \ ((struct microcode_intel *)mc)->hdr.totalsize : \ @@ -197,30 +198,31 @@ static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf) } static inline int -update_match_revision(struct microcode_header_intel *mc_header, int rev) +update_match_revision(struct microcode_header_intel *mc_header, int rev) { return (mc_header->rev <= rev) ? 0 : 1; } static int microcode_sanity_check(void *mc) { + unsigned long total_size, data_size, ext_table_size; struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; - struct extended_signature *ext_sig; - unsigned long total_size, data_size, ext_table_size; int sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; total_size = get_totalsize(mc_header); data_size = get_datasize(mc_header); + if (data_size + MC_HEADER_SIZE > total_size) { printk(KERN_ERR "microcode: error! " - "Bad data size in microcode data file\n"); + "Bad data size in microcode data file\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { printk(KERN_ERR "microcode: error! " - "Unknown microcode update format\n"); + "Unknown microcode update format\n"); return -EINVAL; } ext_table_size = total_size - (MC_HEADER_SIZE + data_size); @@ -318,11 +320,15 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) static void apply_microcode(int cpu) { + struct microcode_intel *mc_intel; + struct ucode_cpu_info *uci; unsigned long flags; unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct microcode_intel *mc_intel = uci->mc; + int cpu_num; + + cpu_num = raw_smp_processor_id(); + uci = ucode_cpu_info + cpu; + mc_intel = uci->mc; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -348,15 +354,17 @@ static void apply_microcode(int cpu) spin_unlock_irqrestore(µcode_update_lock, flags); if (val[1] != mc_intel->hdr.rev) { printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); + "0x%x to 0x%x failed\n", + cpu_num, uci->cpu_sig.rev, val[1]); return; } printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %04x-%02x-%02x \n", + "0x%x to 0x%x, date = %04x-%02x-%02x \n", cpu_num, uci->cpu_sig.rev, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, (mc_intel->hdr.date >> 16) & 0xff); + uci->cpu_sig.rev = val[1]; } @@ -404,18 +412,23 @@ static int generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (new_mc) { - if (!leftover) { - if (uci->mc) - vfree(uci->mc); - uci->mc = (struct microcode_intel *)new_mc; - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); - } else - vfree(new_mc); + if (!new_mc) + goto out; + + if (leftover) { + vfree(new_mc); + goto out; } + if (uci->mc) + vfree(uci->mc); + uci->mc = (struct microcode_intel *)new_mc; + + pr_debug("microcode: CPU%d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); + + out: return (int)leftover; } -- cgit v1.2.3-59-g8ed1b From 89bd55d1855f8e9a4e9add8e93f3144d049c469e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Mar 2009 16:31:29 +1030 Subject: x86: cpumask: update 32-bit APM not to mug current->cpus_allowed Impact: cleanup, avoid cpumask games The APM code wants to run on CPU 0: we create an "on_cpu0" wrapper which uses work_on_cpu() if we're not already on cpu 0. This introduces a new failure mode: -ENOMEM, so we add an explicit err arg and handle Linux-style errnos in apm_err(). Signed-off-by: Rusty Russell Cc: Andrew Morton Cc: Stephen Rothwell LKML-Reference: <200903111631.29787.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 248 +++++++++++++++++++++++++++-------------------- 1 file changed, 145 insertions(+), 103 deletions(-) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 10033fe718e0..c1941be9fb17 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -466,7 +466,7 @@ static const lookup_t error_table[] = { * @err: APM BIOS return code * * Write a meaningful log entry to the kernel log in the event of - * an APM error. + * an APM error. Note that this also handles (negative) kernel errors. */ static void apm_error(char *str, int err) @@ -478,42 +478,13 @@ static void apm_error(char *str, int err) break; if (i < ERROR_COUNT) printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); + else if (err < 0) + printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err); else printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", str, err); } -/* - * Lock APM functionality to physical CPU 0 - */ - -#ifdef CONFIG_SMP - -static cpumask_t apm_save_cpus(void) -{ - cpumask_t x = current->cpus_allowed; - /* Some bioses don't like being called from CPU != 0 */ - set_cpus_allowed(current, cpumask_of_cpu(0)); - BUG_ON(smp_processor_id() != 0); - return x; -} - -static inline void apm_restore_cpus(cpumask_t mask) -{ - set_cpus_allowed(current, mask); -} - -#else - -/* - * No CPU lockdown needed on a uniprocessor - */ - -#define apm_save_cpus() (current->cpus_allowed) -#define apm_restore_cpus(x) (void)(x) - -#endif - /* * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and * apm_info.allow_ints, we are being really paranoid here! Not only @@ -568,16 +539,23 @@ static inline void apm_irq_restore(unsigned long flags) # define APM_DO_RESTORE_SEGS #endif +struct apm_bios_call { + u32 func; + /* In and out */ + u32 ebx; + u32 ecx; + /* Out only */ + u32 eax; + u32 edx; + u32 esi; + + /* Error: -ENOMEM, or bits 8-15 of eax */ + int err; +}; + /** - * apm_bios_call - Make an APM BIOS 32bit call - * @func: APM function to execute - * @ebx_in: EBX register for call entry - * @ecx_in: ECX register for call entry - * @eax: EAX register return - * @ebx: EBX register return - * @ecx: ECX register return - * @edx: EDX register return - * @esi: ESI register return + * __apm_bios_call - Make an APM BIOS 32bit call + * @_call: pointer to struct apm_bios_call. * * Make an APM call using the 32bit protected mode interface. The * caller is responsible for knowing if APM BIOS is configured and @@ -586,79 +564,141 @@ static inline void apm_irq_restore(unsigned long flags) * flag is loaded into AL. If there is an error, then the error * code is returned in AH (bits 8-15 of eax) and this function * returns non-zero. + * + * Note: this makes the call on the current CPU. */ - -static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) +static long __apm_bios_call(void *_call) { APM_DECL_SEGS unsigned long flags; - cpumask_t cpus; int cpu; struct desc_struct save_desc_40; struct desc_struct *gdt; - - cpus = apm_save_cpus(); + struct apm_bios_call *call = _call; cpu = get_cpu(); + BUG_ON(cpu != 0); gdt = get_cpu_gdt_table(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); APM_DO_SAVE_SEGS; - apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); + apm_bios_call_asm(call->func, call->ebx, call->ecx, + &call->eax, &call->ebx, &call->ecx, &call->edx, + &call->esi); APM_DO_RESTORE_SEGS; apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); - apm_restore_cpus(cpus); - return *eax & 0xff; + return call->eax & 0xff; +} + +/* Run __apm_bios_call or __apm_bios_call_simple on CPU 0 */ +static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call) +{ + int ret; + + /* Don't bother with work_on_cpu in the common case, so we don't + * have to worry about OOM or overhead. */ + if (get_cpu() == 0) { + ret = fn(call); + put_cpu(); + } else { + put_cpu(); + ret = work_on_cpu(0, fn, call); + } + + /* work_on_cpu can fail with -ENOMEM */ + if (ret < 0) + call->err = ret; + else + call->err = (call->eax >> 8) & 0xff; + + return ret; } /** - * apm_bios_call_simple - make a simple APM BIOS 32bit call - * @func: APM function to invoke - * @ebx_in: EBX register value for BIOS call - * @ecx_in: ECX register value for BIOS call - * @eax: EAX register on return from the BIOS call + * apm_bios_call - Make an APM BIOS 32bit call (on CPU 0) + * @call: the apm_bios_call registers. + * + * If there is an error, it is returned in @call.err. + */ +static int apm_bios_call(struct apm_bios_call *call) +{ + return on_cpu0(__apm_bios_call, call); +} + +/** + * __apm_bios_call_simple - Make an APM BIOS 32bit call (on CPU 0) + * @_call: pointer to struct apm_bios_call. * * Make a BIOS call that returns one value only, or just status. * If there is an error, then the error code is returned in AH - * (bits 8-15 of eax) and this function returns non-zero. This is - * used for simpler BIOS operations. This call may hold interrupts - * off for a long time on some laptops. + * (bits 8-15 of eax) and this function returns non-zero (it can + * also return -ENOMEM). This is used for simpler BIOS operations. + * This call may hold interrupts off for a long time on some laptops. + * + * Note: this makes the call on the current CPU. */ - -static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) +static long __apm_bios_call_simple(void *_call) { u8 error; APM_DECL_SEGS unsigned long flags; - cpumask_t cpus; int cpu; struct desc_struct save_desc_40; struct desc_struct *gdt; - - cpus = apm_save_cpus(); + struct apm_bios_call *call = _call; cpu = get_cpu(); + BUG_ON(cpu != 0); gdt = get_cpu_gdt_table(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); APM_DO_SAVE_SEGS; - error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); + error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, + &call->eax); APM_DO_RESTORE_SEGS; apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); - apm_restore_cpus(cpus); return error; } +/** + * apm_bios_call_simple - make a simple APM BIOS 32bit call + * @func: APM function to invoke + * @ebx_in: EBX register value for BIOS call + * @ecx_in: ECX register value for BIOS call + * @eax: EAX register on return from the BIOS call + * @err: bits + * + * Make a BIOS call that returns one value only, or just status. + * If there is an error, then the error code is returned in @err + * and this function returns non-zero. This is used for simpler + * BIOS operations. This call may hold interrupts off for a long + * time on some laptops. + */ +static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax, + int *err) +{ + struct apm_bios_call call; + int ret; + + call.func = func; + call.ebx = ebx_in; + call.ecx = ecx_in; + + ret = on_cpu0(__apm_bios_call_simple, &call); + *eax = call.eax; + *err = call.err; + return ret; +} + /** * apm_driver_version - APM driver version * @val: loaded with the APM version on return @@ -678,9 +718,10 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) static int apm_driver_version(u_short *val) { u32 eax; + int err; - if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) - return (eax >> 8) & 0xff; + if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax, &err)) + return err; *val = eax; return APM_SUCCESS; } @@ -701,22 +742,21 @@ static int apm_driver_version(u_short *val) * that APM 1.2 is in use. If no messges are pending the value 0x80 * is returned (No power management events pending). */ - static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) { - u32 eax; - u32 ebx; - u32 ecx; - u32 dummy; + struct apm_bios_call call; - if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, - &dummy, &dummy)) - return (eax >> 8) & 0xff; - *event = ebx; + call.func = APM_FUNC_GET_EVENT; + call.ebx = call.ecx = 0; + + if (apm_bios_call(&call)) + return call.err; + + *event = call.ebx; if (apm_info.connection_version < 0x0102) *info = ~0; /* indicate info not valid */ else - *info = ecx; + *info = call.ecx; return APM_SUCCESS; } @@ -737,9 +777,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) static int set_power_state(u_short what, u_short state) { u32 eax; + int err; - if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) - return (eax >> 8) & 0xff; + if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax, &err)) + return err; return APM_SUCCESS; } @@ -770,6 +811,7 @@ static int apm_do_idle(void) u8 ret = 0; int idled = 0; int polling; + int err; polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { @@ -782,7 +824,7 @@ static int apm_do_idle(void) } if (!need_resched()) { idled = 1; - ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); + ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); } if (polling) current_thread_info()->status |= TS_POLLING; @@ -797,8 +839,7 @@ static int apm_do_idle(void) * Only report the failure the first 5 times. */ if (++t < 5) { - printk(KERN_DEBUG "apm_do_idle failed (%d)\n", - (eax >> 8) & 0xff); + printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err); t = jiffies; } return -1; @@ -816,9 +857,10 @@ static int apm_do_idle(void) static void apm_do_busy(void) { u32 dummy; + int err; if (clock_slowed || ALWAYS_CALL_BUSY) { - (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); + (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy, &err); clock_slowed = 0; } } @@ -937,7 +979,7 @@ static void apm_power_off(void) /* Some bioses don't like being called from CPU != 0 */ if (apm_info.realmode_power_off) { - (void)apm_save_cpus(); + set_cpus_allowed_ptr(current, cpumask_of(0)); machine_real_restart(po_bios_call, sizeof(po_bios_call)); } else { (void)set_system_power_state(APM_STATE_OFF); @@ -956,12 +998,13 @@ static void apm_power_off(void) static int apm_enable_power_management(int enable) { u32 eax; + int err; if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) return APM_NOT_ENGAGED; if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, - enable, &eax)) - return (eax >> 8) & 0xff; + enable, &eax, &err)) + return err; if (enable) apm_info.bios.flags &= ~APM_BIOS_DISABLED; else @@ -986,24 +1029,23 @@ static int apm_enable_power_management(int enable) static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 dummy; + struct apm_bios_call call; + + call.func = APM_FUNC_GET_STATUS; + call.ebx = APM_DEVICE_ALL; + call.ecx = 0; if (apm_info.get_power_status_broken) return APM_32_UNSUPPORTED; - if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, - &eax, &ebx, &ecx, &edx, &dummy)) - return (eax >> 8) & 0xff; - *status = ebx; - *bat = ecx; + if (apm_bios_call(&call)) + return call.err; + *status = call.ebx; + *bat = call.ecx; if (apm_info.get_power_status_swabinminutes) { - *life = swab16((u16)edx); + *life = swab16((u16)call.edx); *life |= 0x8000; } else - *life = edx; + *life = call.edx; return APM_SUCCESS; } @@ -1048,12 +1090,14 @@ static int apm_get_battery_status(u_short which, u_short *status, static int apm_engage_power_management(u_short device, int enable) { u32 eax; + int err; if ((enable == 0) && (device == APM_DEVICE_ALL) && (apm_info.bios.flags & APM_BIOS_DISABLED)) return APM_DISABLED; - if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax)) - return (eax >> 8) & 0xff; + if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, + &eax, &err)) + return err; if (device == APM_DEVICE_ALL) { if (enable) apm_info.bios.flags &= ~APM_BIOS_DISENGAGED; @@ -1682,16 +1726,14 @@ static int apm(void *unused) char *power_stat; char *bat_stat; -#ifdef CONFIG_SMP /* 2002/08/01 - WT * This is to avoid random crashes at boot time during initialization * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D. * Some bioses don't like being called from CPU != 0. * Method suggested by Ingo Molnar. */ - set_cpus_allowed(current, cpumask_of_cpu(0)); + set_cpus_allowed_ptr(current, cpumask_of(0)); BUG_ON(smp_processor_id() != 0); -#endif if (apm_info.connection_version == 0) { apm_info.connection_version = apm_info.bios.version; -- cgit v1.2.3-59-g8ed1b From c38da5692e3a4d5d303c04cbf7e526f1eb761076 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 11 Mar 2009 16:33:55 +1030 Subject: x86: cpumask: x86 mmio-mod.c use cpumask_var_t for downed_cpus Impact: cleanup, reduce memory usage for CONFIG_CPUMASK_OFFSTACK=y Part of the "getting rid of obsolete cpumask_t" patch: 1) Use cpumask_var_t: this is a pointer if CONFIG_CPUMASK_OFFSTACK=y 2) Call alloc_cpumask_var() on first entry into enter_uniprocessor() 3) Use modern cpumask_* functions. Signed-off-by: Rusty Russell Cc: Pekka Paalanen LKML-Reference: <200903111633.55952.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 2c4baa88f2cb..c9342ed8b402 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -378,27 +378,34 @@ static void clear_trace_list(void) } #ifdef CONFIG_HOTPLUG_CPU -static cpumask_t downed_cpus; +static cpumask_var_t downed_cpus; static void enter_uniprocessor(void) { int cpu; int err; + if (downed_cpus == NULL && + !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { + pr_notice(NAME "Failed to allocate mask\n"); + goto out; + } + get_online_cpus(); - downed_cpus = cpu_online_map; - cpu_clear(first_cpu(cpu_online_map), downed_cpus); + cpumask_copy(downed_cpus, cpu_online_mask); + cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); if (num_online_cpus() > 1) pr_notice(NAME "Disabling non-boot CPUs...\n"); put_online_cpus(); - for_each_cpu_mask(cpu, downed_cpus) { + for_each_cpu(cpu, downed_cpus) { err = cpu_down(cpu); if (!err) pr_info(NAME "CPU%d is down.\n", cpu); else pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); } +out: if (num_online_cpus() > 1) pr_warning(NAME "multiple CPUs still online, " "may miss events.\n"); @@ -411,10 +418,10 @@ static void __ref leave_uniprocessor(void) int cpu; int err; - if (cpus_weight(downed_cpus) == 0) + if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) return; pr_notice(NAME "Re-enabling CPUs...\n"); - for_each_cpu_mask(cpu, downed_cpus) { + for_each_cpu(cpu, downed_cpus) { err = cpu_up(cpu); if (!err) pr_info(NAME "enabled CPU%d.\n", cpu); -- cgit v1.2.3-59-g8ed1b From 4acd4d00f716873e27e7b60ae292cbdbfae674dd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Mar 2009 10:40:24 -0400 Subject: tracing: give easy way to clear trace buffer There is currently no easy way to clear the trace buffer. Currently the only way is to change the current tracer. This patch lets the user clear the trace buffer by simply writing into the trace files. echo > /debug/tracing/trace or to clear a single cpu (i.e. for CPU 1): echo > /debug/tracing/per_cpu/cpu1/trace Requested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a2d13e8c8fd8..8d981ababc45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1941,9 +1941,14 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; - struct trace_iterator *iter = m->private; + struct trace_iterator *iter; int cpu; + if (!(file->f_mode & FMODE_READ)) + return 0; + + iter = m->private; + mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) @@ -1969,12 +1974,24 @@ static int tracing_open(struct inode *inode, struct file *file) struct trace_iterator *iter; int ret = 0; - iter = __tracing_open(inode, file); - if (IS_ERR(iter)) - ret = PTR_ERR(iter); - else if (trace_flags & TRACE_ITER_LATENCY_FMT) - iter->iter_flags |= TRACE_FILE_LAT_FMT; + /* If this file was open for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) { + long cpu = (long) inode->i_private; + + if (cpu == TRACE_PIPE_ALL_CPU) + tracing_reset_online_cpus(&global_trace); + else + tracing_reset(&global_trace, cpu); + } + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + else if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + } return ret; } @@ -2049,9 +2066,17 @@ static int show_traces_open(struct inode *inode, struct file *file) return ret; } +static ssize_t +tracing_write_stub(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + return count; +} + static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, + .write = tracing_write_stub, .llseek = seq_lseek, .release = tracing_release, }; @@ -3576,7 +3601,7 @@ static void tracing_init_debugfs_percpu(long cpu) pr_warning("Could not create debugfs 'trace_pipe' entry\n"); /* per cpu trace */ - entry = debugfs_create_file("trace", 0444, d_cpu, + entry = debugfs_create_file("trace", 0644, d_cpu, (void *) cpu, &tracing_fops); if (!entry) pr_warning("Could not create debugfs 'trace' entry\n"); @@ -3890,7 +3915,7 @@ static __init int tracer_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); - entry = debugfs_create_file("trace", 0444, d_tracer, + entry = debugfs_create_file("trace", 0644, d_tracer, (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); if (!entry) pr_warning("Could not create debugfs 'trace' entry\n"); -- cgit v1.2.3-59-g8ed1b From 09933a108e6730a464a1ab676c9decc11aee0edc Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 18 Mar 2009 22:18:56 +0530 Subject: tracing: fix oops in tracepoint_update_probe_range() Change this crash: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] tracepoint_update_probe_range+0x1f/0x9b PGD 13d5fb067 PUD 13d688067 PMD 0 Oops: 0000 [#1] SMP To a more debuggable WARN_ONCE(). Signed-off-by: Jaswinder Singh Rajput Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237394936.3132.1.camel@localhost.localdomain> [ moved the check outside the lock and added a WARN_ON(). ] Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 79602740bbb5..adf28734fe2d 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -272,12 +272,17 @@ static void disable_tracepoint(struct tracepoint *elem) * * Updates the probe callback corresponding to a range of tracepoints. */ -void tracepoint_update_probe_range(struct tracepoint *begin, - struct tracepoint *end) +void +tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) { struct tracepoint *iter; struct tracepoint_entry *mark_entry; + if (!begin) { + WARN_ON_ONCE(1); + return; + } + mutex_lock(&tracepoints_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_tracepoint(iter->name); -- cgit v1.2.3-59-g8ed1b From ec625cb29e66824f7ce41082617aeb93fa4e42e2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Mar 2009 19:54:04 +0100 Subject: tracepoints: dont update zero-sized tracepoint sections Zero-sized tracepoint sections can occur if tracing is enabled but no tracepoint is defined. Do not emit a warning in that case. Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Jaswinder Singh Rajput LKML-Reference: <1237394936.3132.1.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index adf28734fe2d..1ef5d3a601c7 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -278,10 +278,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) struct tracepoint *iter; struct tracepoint_entry *mark_entry; - if (!begin) { - WARN_ON_ONCE(1); + if (!begin) return; - } mutex_lock(&tracepoints_mutex); for (iter = begin; iter < end; iter++) { -- cgit v1.2.3-59-g8ed1b From 2283963f27fdd56b185e49a964c290130c7c95ab Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 11 Jan 2009 14:27:17 -0500 Subject: nfsd4: split lockstateid/openstateid release logic The flags here attempt to make the code more general, but I find it actually just adds confusion. I think it's clearer to separate the logic for the open and lock cases entirely. And eventually we may want to separate the stateowner and stateid types as well, as many of the fields aren't shared between the lock and open cases. Also move to eliminate forward references. Start with the stateid's. Signed-off-by: J. Bruce Fields Reviewed-by: Benny Halevy --- fs/nfsd/nfs4state.c | 55 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b6f60f48e94b..e83b3c865aa3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -119,7 +119,6 @@ opaque_hashval(const void *ptr, int nbytes) /* forward declarations */ static void release_stateowner(struct nfs4_stateowner *sop); -static void release_stateid(struct nfs4_stateid *stp, int flags); /* * Delegation state @@ -311,6 +310,34 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE]; static struct list_head client_lru; static struct list_head close_lru; +static void unhash_generic_stateid(struct nfs4_stateid *stp) +{ + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); +} + +static void free_generic_stateid(struct nfs4_stateid *stp) +{ + put_nfs4_file(stp->st_file); + kmem_cache_free(stateid_slab, stp); +} + +static void release_lock_stateid(struct nfs4_stateid *stp) +{ + unhash_generic_stateid(stp); + locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner); + free_generic_stateid(stp); +} + +static void release_open_stateid(struct nfs4_stateid *stp) +{ + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); + nfsd_close(stp->st_vfs_file); + free_generic_stateid(stp); +} + static inline void renew_client(struct nfs4_client *clp) { @@ -1065,9 +1092,9 @@ unhash_stateowner(struct nfs4_stateowner *sop) stp = list_entry(sop->so_stateids.next, struct nfs4_stateid, st_perstateowner); if (sop->so_is_open_owner) - release_stateid(stp, OPEN_STATE); + release_open_stateid(stp); else - release_stateid(stp, LOCK_STATE); + release_lock_stateid(stp); } } @@ -1105,24 +1132,6 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * stp->st_openstp = NULL; } -static void -release_stateid(struct nfs4_stateid *stp, int flags) -{ - struct file *filp = stp->st_vfs_file; - - list_del(&stp->st_hash); - list_del(&stp->st_perfile); - list_del(&stp->st_perstateowner); - if (flags & OPEN_STATE) { - release_stateid_lockowners(stp); - stp->st_vfs_file = NULL; - nfsd_close(filp); - } else if (flags & LOCK_STATE) - locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner); - put_nfs4_file(stp->st_file); - kmem_cache_free(stateid_slab, stp); -} - static void move_to_close_lru(struct nfs4_stateowner *sop) { @@ -1764,7 +1773,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf init_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { - release_stateid(stp, OPEN_STATE); + release_open_stateid(stp); goto out; } } @@ -2373,7 +2382,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); /* release_stateid() calls nfsd_close() if needed */ - release_stateid(stp, OPEN_STATE); + release_open_stateid(stp); /* place unused nfs4_stateowners on so_close_lru list to be * released by the laundromat service after the lease period -- cgit v1.2.3-59-g8ed1b From f1d110caf7d759eae2c02c84343f63d83db9b9be Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 11 Jan 2009 14:37:31 -0500 Subject: nfsd4: remove a forward declaration Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 85 ++++++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e83b3c865aa3..6ab30772496e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -75,7 +75,6 @@ static stateid_t onestateid; /* bits all 1 */ /* forward declarations */ static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); -static void release_stateid_lockowners(struct nfs4_stateid *open_stp); static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static void nfs4_set_recdir(char *recdir); @@ -330,6 +329,20 @@ static void release_lock_stateid(struct nfs4_stateid *stp) free_generic_stateid(stp); } +static void +release_stateid_lockowners(struct nfs4_stateid *open_stp) +{ + struct nfs4_stateowner *lock_sop; + + while (!list_empty(&open_stp->st_lockowners)) { + lock_sop = list_entry(open_stp->st_lockowners.next, + struct nfs4_stateowner, so_perstateid); + /* list_del(&open_stp->st_lockowners); */ + BUG_ON(lock_sop->so_is_open_owner); + release_stateowner(lock_sop); + } +} + static void release_open_stateid(struct nfs4_stateid *stp) { unhash_generic_stateid(stp); @@ -338,6 +351,34 @@ static void release_open_stateid(struct nfs4_stateid *stp) free_generic_stateid(stp); } +static void +unhash_stateowner(struct nfs4_stateowner *sop) +{ + struct nfs4_stateid *stp; + + list_del(&sop->so_idhash); + list_del(&sop->so_strhash); + if (sop->so_is_open_owner) + list_del(&sop->so_perclient); + list_del(&sop->so_perstateid); + while (!list_empty(&sop->so_stateids)) { + stp = list_entry(sop->so_stateids.next, + struct nfs4_stateid, st_perstateowner); + if (sop->so_is_open_owner) + release_open_stateid(stp); + else + release_lock_stateid(stp); + } +} + +static void +release_stateowner(struct nfs4_stateowner *sop) +{ + unhash_stateowner(sop); + list_del(&sop->so_close_lru); + nfs4_put_stateowner(sop); +} + static inline void renew_client(struct nfs4_client *clp) { @@ -1064,48 +1105,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return sop; } -static void -release_stateid_lockowners(struct nfs4_stateid *open_stp) -{ - struct nfs4_stateowner *lock_sop; - - while (!list_empty(&open_stp->st_lockowners)) { - lock_sop = list_entry(open_stp->st_lockowners.next, - struct nfs4_stateowner, so_perstateid); - /* list_del(&open_stp->st_lockowners); */ - BUG_ON(lock_sop->so_is_open_owner); - release_stateowner(lock_sop); - } -} - -static void -unhash_stateowner(struct nfs4_stateowner *sop) -{ - struct nfs4_stateid *stp; - - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - if (sop->so_is_open_owner) - list_del(&sop->so_perclient); - list_del(&sop->so_perstateid); - while (!list_empty(&sop->so_stateids)) { - stp = list_entry(sop->so_stateids.next, - struct nfs4_stateid, st_perstateowner); - if (sop->so_is_open_owner) - release_open_stateid(stp); - else - release_lock_stateid(stp); - } -} - -static void -release_stateowner(struct nfs4_stateowner *sop) -{ - unhash_stateowner(sop); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); -} - static inline void init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_stateowner *sop = open->op_stateowner; -- cgit v1.2.3-59-g8ed1b From f044ff830f1afe91e4388320f0c7b6e08d2e05f8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 11 Jan 2009 15:24:04 -0500 Subject: nfsd4: split open/lockowner release code The caller always knows specifically whether it's releasing a lockowner or an openowner, and the code is simpler if we use separate functions (and the apparent recursion is gone). Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 57 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6ab30772496e..41a3590ef2cc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -116,9 +116,6 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -/* forward declarations */ -static void release_stateowner(struct nfs4_stateowner *sop); - /* * Delegation state */ @@ -329,6 +326,26 @@ static void release_lock_stateid(struct nfs4_stateid *stp) free_generic_stateid(stp); } +static void unhash_lockowner(struct nfs4_stateowner *sop) +{ + struct nfs4_stateid *stp; + + list_del(&sop->so_idhash); + list_del(&sop->so_strhash); + list_del(&sop->so_perstateid); + while (!list_empty(&sop->so_stateids)) { + stp = list_first_entry(&sop->so_stateids, + struct nfs4_stateid, st_perstateowner); + release_lock_stateid(stp); + } +} + +static void release_lockowner(struct nfs4_stateowner *sop) +{ + unhash_lockowner(sop); + nfs4_put_stateowner(sop); +} + static void release_stateid_lockowners(struct nfs4_stateid *open_stp) { @@ -339,7 +356,7 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp) struct nfs4_stateowner, so_perstateid); /* list_del(&open_stp->st_lockowners); */ BUG_ON(lock_sop->so_is_open_owner); - release_stateowner(lock_sop); + release_lockowner(lock_sop); } } @@ -351,30 +368,24 @@ static void release_open_stateid(struct nfs4_stateid *stp) free_generic_stateid(stp); } -static void -unhash_stateowner(struct nfs4_stateowner *sop) +static void unhash_openowner(struct nfs4_stateowner *sop) { struct nfs4_stateid *stp; list_del(&sop->so_idhash); list_del(&sop->so_strhash); - if (sop->so_is_open_owner) - list_del(&sop->so_perclient); - list_del(&sop->so_perstateid); + list_del(&sop->so_perclient); + list_del(&sop->so_perstateid); /* XXX: necessary? */ while (!list_empty(&sop->so_stateids)) { - stp = list_entry(sop->so_stateids.next, - struct nfs4_stateid, st_perstateowner); - if (sop->so_is_open_owner) - release_open_stateid(stp); - else - release_lock_stateid(stp); + stp = list_first_entry(&sop->so_stateids, + struct nfs4_stateid, st_perstateowner); + release_open_stateid(stp); } } -static void -release_stateowner(struct nfs4_stateowner *sop) +static void release_openowner(struct nfs4_stateowner *sop) { - unhash_stateowner(sop); + unhash_openowner(sop); list_del(&sop->so_close_lru); nfs4_put_stateowner(sop); } @@ -488,7 +499,7 @@ expire_client(struct nfs4_client *clp) list_del(&clp->cl_lru); while (!list_empty(&clp->cl_openowners)) { sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); - release_stateowner(sop); + release_openowner(sop); } put_nfs4_client(clp); } @@ -1443,7 +1454,7 @@ nfsd4_process_open1(struct nfsd4_open *open) if (!sop->so_confirmed) { /* Replace unconfirmed owners without checking for replay. */ clp = sop->so_client; - release_stateowner(sop); + release_openowner(sop); open->op_stateowner = NULL; goto renew; } @@ -1906,7 +1917,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", sop->so_id); - release_stateowner(sop); + release_openowner(sop); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -2796,7 +2807,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } out: if (status && lock->lk_is_new && lock_sop) - release_stateowner(lock_sop); + release_lockowner(lock_sop); if (lock->lk_replay_owner) { nfs4_get_stateowner(lock->lk_replay_owner); cstate->replay_owner = lock->lk_replay_owner; @@ -3045,7 +3056,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, /* unhash_stateowner deletes so_perclient only * for openowners. */ list_del(&sop->so_perclient); - release_stateowner(sop); + release_lockowner(sop); } out: nfs4_unlock_state(); -- cgit v1.2.3-59-g8ed1b From 12214cb78147d42da28bc0d1b2917584e0a8efcd Mon Sep 17 00:00:00 2001 From: Qinghuang Feng Date: Mon, 12 Jan 2009 03:13:53 +0800 Subject: NFSD: cleanup for nfs3proc.c MSDOS_SUPER_MAGIC is defined in , so use MSDOS_SUPER_MAGIC directly. Signed-off-by: Qinghuang Feng Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9dbd2eb91281..579ce8c69daa 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -569,7 +570,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; /* Note that we don't care for remote fs's here */ - if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) { + if (sb->s_magic == MSDOS_SUPER_MAGIC) { resp->f_properties = NFS3_FSF_BILLYBOY; } resp->f_maxfilesize = sb->s_maxbytes; @@ -610,7 +611,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->p_link_max = EXT2_LINK_MAX; resp->p_name_max = EXT2_NAME_LEN; break; - case 0x4d44: /* MSDOS_SUPER_MAGIC */ + case MSDOS_SUPER_MAGIC: resp->p_case_insensitive = 1; resp->p_case_preserving = 0; break; -- cgit v1.2.3-59-g8ed1b From 686665619e4424b4f80c3a49e3e1229f27ea565c Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Thu, 29 Jan 2009 20:53:57 +0530 Subject: nfsd : Define NFSD only when FILE_LOCKING is enabled Enable NFSD only when FILE_LOCKING is enabled, since we don't want to support NFSD without FILE_LOCKING. Signed-off-by: Manish Katiyar Signed-off-by: J. Bruce Fields --- fs/nfsd/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 44d7d04dab95..503b9da159a3 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -1,6 +1,7 @@ config NFSD tristate "NFS server support" depends on INET + depends on FILE_LOCKING select LOCKD select SUNRPC select EXPORTFS -- cgit v1.2.3-59-g8ed1b From 99f88726381f676bba6e7dcf74b7412857d7946a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 2 Feb 2009 15:12:27 -0500 Subject: nfsd: clarify exclusive create bitmask result. The use of |= is confusing--the bitmask is always initialized to zero in this case, so we're effectively just doing an assignment here. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9fa60a3ad48c..85b9a084177a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -103,11 +103,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o (u32 *)open->op_verf.data, &open->op_truncate, &created); - /* If we ever decide to use different attrs to store the - * verifier in nfsd_create_v3, then we'll need to change this + /* + * Following rfc 3530 14.2.16, use the returned bitmask + * to indicate which attributes we used to store the + * verifier: */ if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) - open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | + open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); } else { status = nfsd_lookup(rqstp, current_fh, -- cgit v1.2.3-59-g8ed1b From 13024b7b4097d33fe6bba34b1c83602e88753270 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 2 Feb 2009 17:04:03 -0500 Subject: nfsd4: fix misplaced comment Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 85b9a084177a..326506272258 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -120,9 +120,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o goto out; set_change_info(&open->op_cinfo, current_fh); + fh_dup2(current_fh, &resfh); /* set reply cache */ - fh_dup2(current_fh, &resfh); open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size; memcpy(open->op_stateowner->so_replay.rp_openfh, &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size); -- cgit v1.2.3-59-g8ed1b From a4773c08f2872626cb923433284488fbe8acb0ae Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 2 Feb 2009 17:23:10 -0500 Subject: nfsd4: use helper for copying filehandles for replay Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 17 ++++++----------- include/linux/nfsd/nfsfh.h | 7 +++++++ include/linux/nfsd/state.h | 3 +-- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 326506272258..af66073ed423 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -123,10 +123,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o fh_dup2(current_fh, &resfh); /* set reply cache */ - open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size; - memcpy(open->op_stateowner->so_replay.rp_openfh, - &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size); - + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + &resfh.fh_handle); if (!created) status = do_open_permission(rqstp, current_fh, open, NFSD_MAY_NOP); @@ -152,10 +150,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); /* set replay cache */ - open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size; - memcpy(open->op_stateowner->so_replay.rp_openfh, - ¤t_fh->fh_handle.fh_base, - current_fh->fh_handle.fh_size); + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + ¤t_fh->fh_handle); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && (open->op_iattr.ia_size == 0); @@ -187,9 +183,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status == nfserr_replay_me) { struct nfs4_replay *rp = &open->op_stateowner->so_replay; fh_put(&cstate->current_fh); - cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; - memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, - rp->rp_openfh_len); + fh_copy_shallow(&cstate->current_fh.fh_handle, + &rp->rp_openfh); status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) dprintk("nfsd4_open: replay failed" diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index fa317f6c154b..afa19016c4a8 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -269,6 +269,13 @@ fh_copy(struct svc_fh *dst, struct svc_fh *src) return dst; } +static inline void +fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src) +{ + dst->fh_size = src->fh_size; + memcpy(&dst->fh_base, &src->fh_base, src->fh_size); +} + static __inline__ struct svc_fh * fh_init(struct svc_fh *fhp, int maxsize) { diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 128298c0362d..b65b2a6274b0 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -168,8 +168,7 @@ struct nfs4_replay { unsigned int rp_buflen; char *rp_buf; unsigned intrp_allocated; - int rp_openfh_len; - char rp_openfh[NFS4_FHSIZE]; + struct knfsd_fh rp_openfh; char rp_ibuf[NFSD4_REPLAY_ISIZE]; }; -- cgit v1.2.3-59-g8ed1b From 6c02eaa1d1e53b9b2cc27d0c6fff3e57da4b611f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 2 Feb 2009 17:30:51 -0500 Subject: nfsd4: use helper for copying delegation filehandle Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 4 ++-- fs/nfsd/nfs4state.c | 4 +--- include/linux/nfsd/state.h | 6 ++---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c464181b5994..c6804db33c17 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -218,7 +218,7 @@ static int encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) { __be32 *p; - int len = cb_rec->cbr_fhlen; + int len = cb_rec->cbr_fh.fh_size; RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); WRITE32(OP_CB_RECALL); @@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITE32(cb_rec->cbr_trunc); WRITE32(len); - WRITEMEM(cb_rec->cbr_fhval, len); + WRITEMEM(&cb_rec->cbr_fh.fh_base, len); return 0; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 41a3590ef2cc..7f616e928a57 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -215,9 +215,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_stateid.si_stateownerid = current_delegid++; dp->dl_stateid.si_fileid = 0; dp->dl_stateid.si_generation = 0; - dp->dl_fhlen = current_fh->fh_handle.fh_size; - memcpy(dp->dl_fhval, ¤t_fh->fh_handle.fh_base, - current_fh->fh_handle.fh_size); + fh_copy_shallow(&dp->dl_fh, ¤t_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); list_add(&dp->dl_perfile, &fp->fi_delegations); diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index b65b2a6274b0..1130d534bb63 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -66,8 +66,7 @@ struct nfs4_cb_recall { u32 cbr_ident; int cbr_trunc; stateid_t cbr_stateid; - u32 cbr_fhlen; - char cbr_fhval[NFS4_FHSIZE]; + struct knfsd_fh cbr_fh; struct nfs4_delegation *cbr_dp; }; @@ -86,8 +85,7 @@ struct nfs4_delegation { }; #define dl_stateid dl_recall.cbr_stateid -#define dl_fhlen dl_recall.cbr_fhlen -#define dl_fhval dl_recall.cbr_fhval +#define dl_fh dl_recall.cbr_fh /* client delegation callback info */ struct nfs4_callback { -- cgit v1.2.3-59-g8ed1b From e37da04ed145d45c2a698d7cb373a7e1191fbe86 Mon Sep 17 00:00:00 2001 From: Alexandros Batsakis Date: Thu, 18 Dec 2008 19:55:16 -0800 Subject: nfsd: lock state around put client and delegation in nfsd4_cb_recall not having the state locked before putting the client/delegation causes a bug. Also removed the comment from the function header about the state being already locked Signed-off-by: Alexandros Batsakis Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c6804db33c17..3ddc9fb2e358 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -451,7 +451,6 @@ nfsd4_probe_callback(struct nfs4_client *clp) /* * called with dp->dl_count inc'ed. - * nfs4_lock_state() may or may not have been called. */ void nfsd4_cb_recall(struct nfs4_delegation *dp) @@ -491,7 +490,9 @@ out_put_cred: * Success or failure, now we're either waiting for lease expiration * or deleg_return. */ + nfs4_lock_state(); put_nfs4_client(clp); nfs4_put_delegation(dp); + nfs4_unlock_state(); return; } -- cgit v1.2.3-59-g8ed1b From e33d1ea60c3a17b8b5c2910b1eef4c1faf0ac450 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 9 Feb 2009 12:30:43 -0500 Subject: lockd: clean up blocking lock cases of nlsmvc_lock() No change in behavior, just rearranging the switch so that we break out of the switch if and only if we're in the wait case. Signed-off-by: Miklos Szeredi Signed-off-by: J. Bruce Fields --- fs/lockd/svclock.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 763b78a6e9de..83ee34203bd7 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlm_granted; goto out; case -EAGAIN: + /* + * If this is a blocking request for an + * already pending lock request then we need + * to put it back on lockd's block list + */ + if (wait) + break; ret = nlm_lck_denied; - break; + goto out; case FILE_LOCK_DEFERRED: if (wait) break; @@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - ret = nlm_lck_denied; - if (!wait) - goto out; - ret = nlm_lck_blocked; /* Append to list of blocked */ -- cgit v1.2.3-59-g8ed1b From 4ac35c2f794503d3acda20d98e89cf63f6e94332 Mon Sep 17 00:00:00 2001 From: wengang wang Date: Tue, 10 Feb 2009 11:27:51 +0800 Subject: nfsd(v2/v3): fix the failure of creation from HPUX client sometimes HPUX nfs client sends a create request to linux nfs server(v2/v3). the dump of the request is like: obj_attributes mode: value follows set_it: value follows (1) mode: 00 uid: no value set_it: no value (0) gid: value follows set_it: value follows (1) gid: 8030 size: value follows set_it: value follows (1) size: 0 atime: don't change set_it: don't change (0) mtime: don't change set_it: don't change (0) note that mode is 00(havs no rwx privilege even for the owner) and it requires to set size to 0. as current nfsd(v2/v3) implementation, the server does mainly 2 steps: 1) creates the file in mode specified by calling vfs_create(). 2) sets attributes for the file by calling nfsd_setattr(). at step 2), it finally calls file system specific setattr() function which may fail when checking permission because changing size needs WRITE privilege but it has none since mode is 000. for this case, a new file created, we may simply ignore the request of setting size to 0, so that WRITE privilege is not needed and the open succeeds. Signed-off-by: Wengang Wang -- vfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6e50aaa56ca2..0c076293155d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1176,6 +1176,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, return 0; } +/* HPUX client sometimes creates a file in mode 000, and sets size to 0. + * setting size to 0 may fail for some specific file systems by the permission + * checking which requires WRITE permission but the mode is 000. + * we ignore the resizing(to 0) on the just new created file, since the size is + * 0 after file created. + * + * call this only after vfs_create() is called. + * */ +static void +nfsd_check_ignore_resizing(struct iattr *iap) +{ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; +} + /* * Create a file (regular, directory, device, fifo); UNIX sockets * not yet implemented. @@ -1271,6 +1286,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, switch (type) { case S_IFREG: host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + if (!host_err) + nfsd_check_ignore_resizing(iap); break; case S_IFDIR: host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); @@ -1424,6 +1441,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, /* setattr will sync the child (or not) */ } + nfsd_check_ignore_resizing(iap); + if (createmode == NFS3_CREATE_EXCLUSIVE) { /* Cram the verifier into atime/mtime */ iap->ia_valid = ATTR_MTIME|ATTR_ATIME -- cgit v1.2.3-59-g8ed1b From 77f18f5e4ebdea35ec3d92343b0ed7546dc87637 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 11 Feb 2009 17:16:58 -0800 Subject: nfs: replace uses of __constant_{endian} The base versions handle constant folding now, none of these headers are exported to userspace, so the __ prefixed versions are not necessary. Signed-off-by: Harvey Harrison Reviewed-by: NeilBrown Signed-off-by: J. Bruce Fields --- include/linux/lockd/xdr.h | 12 ++--- include/linux/lockd/xdr4.h | 10 ++-- include/linux/nfsd/nfsd.h | 132 ++++++++++++++++++++++----------------------- include/linux/sunrpc/xdr.h | 42 +++++++-------- 4 files changed, 98 insertions(+), 98 deletions(-) diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 7dc5b6cb44cd..d39ed1cc5fbf 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -25,13 +25,13 @@ struct svc_rqst; #define NLM_MAXCOOKIELEN 32 #define NLM_MAXSTRLEN 1024 -#define nlm_granted __constant_htonl(NLM_LCK_GRANTED) -#define nlm_lck_denied __constant_htonl(NLM_LCK_DENIED) -#define nlm_lck_denied_nolocks __constant_htonl(NLM_LCK_DENIED_NOLOCKS) -#define nlm_lck_blocked __constant_htonl(NLM_LCK_BLOCKED) -#define nlm_lck_denied_grace_period __constant_htonl(NLM_LCK_DENIED_GRACE_PERIOD) +#define nlm_granted cpu_to_be32(NLM_LCK_GRANTED) +#define nlm_lck_denied cpu_to_be32(NLM_LCK_DENIED) +#define nlm_lck_denied_nolocks cpu_to_be32(NLM_LCK_DENIED_NOLOCKS) +#define nlm_lck_blocked cpu_to_be32(NLM_LCK_BLOCKED) +#define nlm_lck_denied_grace_period cpu_to_be32(NLM_LCK_DENIED_GRACE_PERIOD) -#define nlm_drop_reply __constant_htonl(30000) +#define nlm_drop_reply cpu_to_be32(30000) /* Lock info passed via NLM */ struct nlm_lock { diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h index 12bfe09de2b1..7353821341ed 100644 --- a/include/linux/lockd/xdr4.h +++ b/include/linux/lockd/xdr4.h @@ -15,11 +15,11 @@ #include /* error codes new to NLMv4 */ -#define nlm4_deadlock __constant_htonl(NLM_DEADLCK) -#define nlm4_rofs __constant_htonl(NLM_ROFS) -#define nlm4_stale_fh __constant_htonl(NLM_STALE_FH) -#define nlm4_fbig __constant_htonl(NLM_FBIG) -#define nlm4_failed __constant_htonl(NLM_FAILED) +#define nlm4_deadlock cpu_to_be32(NLM_DEADLCK) +#define nlm4_rofs cpu_to_be32(NLM_ROFS) +#define nlm4_stale_fh cpu_to_be32(NLM_STALE_FH) +#define nlm4_fbig cpu_to_be32(NLM_FBIG) +#define nlm4_failed cpu_to_be32(NLM_FAILED) diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index e19f45991b2e..16f7b403d9c1 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -186,78 +186,78 @@ void nfsd_lockd_shutdown(void); /* * These macros provide pre-xdr'ed values for faster operation. */ -#define nfs_ok __constant_htonl(NFS_OK) -#define nfserr_perm __constant_htonl(NFSERR_PERM) -#define nfserr_noent __constant_htonl(NFSERR_NOENT) -#define nfserr_io __constant_htonl(NFSERR_IO) -#define nfserr_nxio __constant_htonl(NFSERR_NXIO) -#define nfserr_eagain __constant_htonl(NFSERR_EAGAIN) -#define nfserr_acces __constant_htonl(NFSERR_ACCES) -#define nfserr_exist __constant_htonl(NFSERR_EXIST) -#define nfserr_xdev __constant_htonl(NFSERR_XDEV) -#define nfserr_nodev __constant_htonl(NFSERR_NODEV) -#define nfserr_notdir __constant_htonl(NFSERR_NOTDIR) -#define nfserr_isdir __constant_htonl(NFSERR_ISDIR) -#define nfserr_inval __constant_htonl(NFSERR_INVAL) -#define nfserr_fbig __constant_htonl(NFSERR_FBIG) -#define nfserr_nospc __constant_htonl(NFSERR_NOSPC) -#define nfserr_rofs __constant_htonl(NFSERR_ROFS) -#define nfserr_mlink __constant_htonl(NFSERR_MLINK) -#define nfserr_opnotsupp __constant_htonl(NFSERR_OPNOTSUPP) -#define nfserr_nametoolong __constant_htonl(NFSERR_NAMETOOLONG) -#define nfserr_notempty __constant_htonl(NFSERR_NOTEMPTY) -#define nfserr_dquot __constant_htonl(NFSERR_DQUOT) -#define nfserr_stale __constant_htonl(NFSERR_STALE) -#define nfserr_remote __constant_htonl(NFSERR_REMOTE) -#define nfserr_wflush __constant_htonl(NFSERR_WFLUSH) -#define nfserr_badhandle __constant_htonl(NFSERR_BADHANDLE) -#define nfserr_notsync __constant_htonl(NFSERR_NOT_SYNC) -#define nfserr_badcookie __constant_htonl(NFSERR_BAD_COOKIE) -#define nfserr_notsupp __constant_htonl(NFSERR_NOTSUPP) -#define nfserr_toosmall __constant_htonl(NFSERR_TOOSMALL) -#define nfserr_serverfault __constant_htonl(NFSERR_SERVERFAULT) -#define nfserr_badtype __constant_htonl(NFSERR_BADTYPE) -#define nfserr_jukebox __constant_htonl(NFSERR_JUKEBOX) -#define nfserr_denied __constant_htonl(NFSERR_DENIED) -#define nfserr_deadlock __constant_htonl(NFSERR_DEADLOCK) -#define nfserr_expired __constant_htonl(NFSERR_EXPIRED) -#define nfserr_bad_cookie __constant_htonl(NFSERR_BAD_COOKIE) -#define nfserr_same __constant_htonl(NFSERR_SAME) -#define nfserr_clid_inuse __constant_htonl(NFSERR_CLID_INUSE) -#define nfserr_stale_clientid __constant_htonl(NFSERR_STALE_CLIENTID) -#define nfserr_resource __constant_htonl(NFSERR_RESOURCE) -#define nfserr_moved __constant_htonl(NFSERR_MOVED) -#define nfserr_nofilehandle __constant_htonl(NFSERR_NOFILEHANDLE) -#define nfserr_minor_vers_mismatch __constant_htonl(NFSERR_MINOR_VERS_MISMATCH) -#define nfserr_share_denied __constant_htonl(NFSERR_SHARE_DENIED) -#define nfserr_stale_stateid __constant_htonl(NFSERR_STALE_STATEID) -#define nfserr_old_stateid __constant_htonl(NFSERR_OLD_STATEID) -#define nfserr_bad_stateid __constant_htonl(NFSERR_BAD_STATEID) -#define nfserr_bad_seqid __constant_htonl(NFSERR_BAD_SEQID) -#define nfserr_symlink __constant_htonl(NFSERR_SYMLINK) -#define nfserr_not_same __constant_htonl(NFSERR_NOT_SAME) -#define nfserr_restorefh __constant_htonl(NFSERR_RESTOREFH) -#define nfserr_attrnotsupp __constant_htonl(NFSERR_ATTRNOTSUPP) -#define nfserr_bad_xdr __constant_htonl(NFSERR_BAD_XDR) -#define nfserr_openmode __constant_htonl(NFSERR_OPENMODE) -#define nfserr_locks_held __constant_htonl(NFSERR_LOCKS_HELD) -#define nfserr_op_illegal __constant_htonl(NFSERR_OP_ILLEGAL) -#define nfserr_grace __constant_htonl(NFSERR_GRACE) -#define nfserr_no_grace __constant_htonl(NFSERR_NO_GRACE) -#define nfserr_reclaim_bad __constant_htonl(NFSERR_RECLAIM_BAD) -#define nfserr_badname __constant_htonl(NFSERR_BADNAME) -#define nfserr_cb_path_down __constant_htonl(NFSERR_CB_PATH_DOWN) -#define nfserr_locked __constant_htonl(NFSERR_LOCKED) -#define nfserr_wrongsec __constant_htonl(NFSERR_WRONGSEC) -#define nfserr_replay_me __constant_htonl(NFSERR_REPLAY_ME) +#define nfs_ok cpu_to_be32(NFS_OK) +#define nfserr_perm cpu_to_be32(NFSERR_PERM) +#define nfserr_noent cpu_to_be32(NFSERR_NOENT) +#define nfserr_io cpu_to_be32(NFSERR_IO) +#define nfserr_nxio cpu_to_be32(NFSERR_NXIO) +#define nfserr_eagain cpu_to_be32(NFSERR_EAGAIN) +#define nfserr_acces cpu_to_be32(NFSERR_ACCES) +#define nfserr_exist cpu_to_be32(NFSERR_EXIST) +#define nfserr_xdev cpu_to_be32(NFSERR_XDEV) +#define nfserr_nodev cpu_to_be32(NFSERR_NODEV) +#define nfserr_notdir cpu_to_be32(NFSERR_NOTDIR) +#define nfserr_isdir cpu_to_be32(NFSERR_ISDIR) +#define nfserr_inval cpu_to_be32(NFSERR_INVAL) +#define nfserr_fbig cpu_to_be32(NFSERR_FBIG) +#define nfserr_nospc cpu_to_be32(NFSERR_NOSPC) +#define nfserr_rofs cpu_to_be32(NFSERR_ROFS) +#define nfserr_mlink cpu_to_be32(NFSERR_MLINK) +#define nfserr_opnotsupp cpu_to_be32(NFSERR_OPNOTSUPP) +#define nfserr_nametoolong cpu_to_be32(NFSERR_NAMETOOLONG) +#define nfserr_notempty cpu_to_be32(NFSERR_NOTEMPTY) +#define nfserr_dquot cpu_to_be32(NFSERR_DQUOT) +#define nfserr_stale cpu_to_be32(NFSERR_STALE) +#define nfserr_remote cpu_to_be32(NFSERR_REMOTE) +#define nfserr_wflush cpu_to_be32(NFSERR_WFLUSH) +#define nfserr_badhandle cpu_to_be32(NFSERR_BADHANDLE) +#define nfserr_notsync cpu_to_be32(NFSERR_NOT_SYNC) +#define nfserr_badcookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_notsupp cpu_to_be32(NFSERR_NOTSUPP) +#define nfserr_toosmall cpu_to_be32(NFSERR_TOOSMALL) +#define nfserr_serverfault cpu_to_be32(NFSERR_SERVERFAULT) +#define nfserr_badtype cpu_to_be32(NFSERR_BADTYPE) +#define nfserr_jukebox cpu_to_be32(NFSERR_JUKEBOX) +#define nfserr_denied cpu_to_be32(NFSERR_DENIED) +#define nfserr_deadlock cpu_to_be32(NFSERR_DEADLOCK) +#define nfserr_expired cpu_to_be32(NFSERR_EXPIRED) +#define nfserr_bad_cookie cpu_to_be32(NFSERR_BAD_COOKIE) +#define nfserr_same cpu_to_be32(NFSERR_SAME) +#define nfserr_clid_inuse cpu_to_be32(NFSERR_CLID_INUSE) +#define nfserr_stale_clientid cpu_to_be32(NFSERR_STALE_CLIENTID) +#define nfserr_resource cpu_to_be32(NFSERR_RESOURCE) +#define nfserr_moved cpu_to_be32(NFSERR_MOVED) +#define nfserr_nofilehandle cpu_to_be32(NFSERR_NOFILEHANDLE) +#define nfserr_minor_vers_mismatch cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH) +#define nfserr_share_denied cpu_to_be32(NFSERR_SHARE_DENIED) +#define nfserr_stale_stateid cpu_to_be32(NFSERR_STALE_STATEID) +#define nfserr_old_stateid cpu_to_be32(NFSERR_OLD_STATEID) +#define nfserr_bad_stateid cpu_to_be32(NFSERR_BAD_STATEID) +#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) +#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) +#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) +#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) +#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) +#define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE) +#define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD) +#define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL) +#define nfserr_grace cpu_to_be32(NFSERR_GRACE) +#define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) +#define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) +#define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) +#define nfserr_locked cpu_to_be32(NFSERR_LOCKED) +#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) +#define nfserr_replay_me cpu_to_be32(NFSERR_REPLAY_ME) /* error codes for internal use */ /* if a request fails due to kmalloc failure, it gets dropped. * Client should resend eventually */ -#define nfserr_dropit __constant_htonl(30000) +#define nfserr_dropit cpu_to_be32(30000) /* end-of-file indicator in readdir */ -#define nfserr_eof __constant_htonl(30001) +#define nfserr_eof cpu_to_be32(30001) /* Check for dir entries '.' and '..' */ #define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 49e1eb454465..d8910b68e1bd 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -69,27 +69,27 @@ struct xdr_buf { * pre-xdr'ed macros. */ -#define xdr_zero __constant_htonl(0) -#define xdr_one __constant_htonl(1) -#define xdr_two __constant_htonl(2) - -#define rpc_success __constant_htonl(RPC_SUCCESS) -#define rpc_prog_unavail __constant_htonl(RPC_PROG_UNAVAIL) -#define rpc_prog_mismatch __constant_htonl(RPC_PROG_MISMATCH) -#define rpc_proc_unavail __constant_htonl(RPC_PROC_UNAVAIL) -#define rpc_garbage_args __constant_htonl(RPC_GARBAGE_ARGS) -#define rpc_system_err __constant_htonl(RPC_SYSTEM_ERR) -#define rpc_drop_reply __constant_htonl(RPC_DROP_REPLY) - -#define rpc_auth_ok __constant_htonl(RPC_AUTH_OK) -#define rpc_autherr_badcred __constant_htonl(RPC_AUTH_BADCRED) -#define rpc_autherr_rejectedcred __constant_htonl(RPC_AUTH_REJECTEDCRED) -#define rpc_autherr_badverf __constant_htonl(RPC_AUTH_BADVERF) -#define rpc_autherr_rejectedverf __constant_htonl(RPC_AUTH_REJECTEDVERF) -#define rpc_autherr_tooweak __constant_htonl(RPC_AUTH_TOOWEAK) -#define rpcsec_gsserr_credproblem __constant_htonl(RPCSEC_GSS_CREDPROBLEM) -#define rpcsec_gsserr_ctxproblem __constant_htonl(RPCSEC_GSS_CTXPROBLEM) -#define rpc_autherr_oldseqnum __constant_htonl(101) +#define xdr_zero cpu_to_be32(0) +#define xdr_one cpu_to_be32(1) +#define xdr_two cpu_to_be32(2) + +#define rpc_success cpu_to_be32(RPC_SUCCESS) +#define rpc_prog_unavail cpu_to_be32(RPC_PROG_UNAVAIL) +#define rpc_prog_mismatch cpu_to_be32(RPC_PROG_MISMATCH) +#define rpc_proc_unavail cpu_to_be32(RPC_PROC_UNAVAIL) +#define rpc_garbage_args cpu_to_be32(RPC_GARBAGE_ARGS) +#define rpc_system_err cpu_to_be32(RPC_SYSTEM_ERR) +#define rpc_drop_reply cpu_to_be32(RPC_DROP_REPLY) + +#define rpc_auth_ok cpu_to_be32(RPC_AUTH_OK) +#define rpc_autherr_badcred cpu_to_be32(RPC_AUTH_BADCRED) +#define rpc_autherr_rejectedcred cpu_to_be32(RPC_AUTH_REJECTEDCRED) +#define rpc_autherr_badverf cpu_to_be32(RPC_AUTH_BADVERF) +#define rpc_autherr_rejectedverf cpu_to_be32(RPC_AUTH_REJECTEDVERF) +#define rpc_autherr_tooweak cpu_to_be32(RPC_AUTH_TOOWEAK) +#define rpcsec_gsserr_credproblem cpu_to_be32(RPCSEC_GSS_CREDPROBLEM) +#define rpcsec_gsserr_ctxproblem cpu_to_be32(RPCSEC_GSS_CTXPROBLEM) +#define rpc_autherr_oldseqnum cpu_to_be32(101) /* * Miscellaneous XDR helper functions -- cgit v1.2.3-59-g8ed1b From a4455be0850009f5da9a3b82523079922cd4b26e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 10:40:22 -0800 Subject: nfsd4: trivial preprocess_stateid_op cleanup Remove a couple redundant comments, adjust style; no change in behavior. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7f616e928a57..b7e2f251ea95 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2072,21 +2072,21 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); - /* STALE STATEID */ status = nfserr_stale_stateid; if (STALE_STATEID(stateid)) goto out; - /* BAD STATEID */ status = nfserr_bad_stateid; if (!stateid->si_fileid) { /* delegation stateid */ - if(!(dp = find_delegation_stateid(ino, stateid))) { + dp = find_delegation_stateid(ino, stateid); + if (!dp) { dprintk("NFSD: delegation stateid not found\n"); goto out; } stidp = &dp->dl_stateid; } else { /* open or lock stateid */ - if (!(stp = find_stateid(stateid, flags))) { + stp = find_stateid(stateid, flags); + if (!stp) { dprintk("NFSD: open or lock stateid not found\n"); goto out; } @@ -2100,13 +2100,15 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl if (status) goto out; if (stp) { - if ((status = nfs4_check_openmode(stp,flags))) + status = nfs4_check_openmode(stp, flags); + if (status) goto out; renew_client(stp->st_stateowner->so_client); if (filpp) *filpp = stp->st_vfs_file; } else { - if ((status = nfs4_check_delegmode(dp, flags))) + status = nfs4_check_delegmode(dp, flags); + if (status) goto out; renew_client(dp->dl_client); if (flags & DELEG_RET) -- cgit v1.2.3-59-g8ed1b From 0c2a498fa6d33d8ca9c8a0c29039c41e1734cb9e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 11:11:50 -0800 Subject: nfsd4: move check_stateid_generation check No change in behavior. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b7e2f251ea95..d6ca2be306dc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2084,6 +2084,9 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl goto out; } stidp = &dp->dl_stateid; + status = check_stateid_generation(stateid, stidp); + if (status) + goto out; } else { /* open or lock stateid */ stp = find_stateid(stateid, flags); if (!stp) { @@ -2095,10 +2098,10 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl if (!stp->st_stateowner->so_confirmed) goto out; stidp = &stp->st_stateid; + status = check_stateid_generation(stateid, stidp); + if (status) + goto out; } - status = check_stateid_generation(stateid, stidp); - if (status) - goto out; if (stp) { status = nfs4_check_openmode(stp, flags); if (status) -- cgit v1.2.3-59-g8ed1b From dc9bf700ed2fc3eab50f31000b13fda781e7c9f1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 11:14:43 -0800 Subject: nfsd4: remove redundant "if" in nfs4_preprocess_stateid_op Note that we exit this first big "if" with stp == NULL if and only if we took the first branch; therefore, the second "if" is redundant, and we can just combine the two, simplifying the logic. Reviewed-by: Yang Hongyang Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d6ca2be306dc..16fcb656f47f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2087,6 +2087,14 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl status = check_stateid_generation(stateid, stidp); if (status) goto out; + status = nfs4_check_delegmode(dp, flags); + if (status) + goto out; + renew_client(dp->dl_client); + if (flags & DELEG_RET) + unhash_delegation(dp); + if (filpp) + *filpp = dp->dl_vfs_file; } else { /* open or lock stateid */ stp = find_stateid(stateid, flags); if (!stp) { @@ -2101,23 +2109,12 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl status = check_stateid_generation(stateid, stidp); if (status) goto out; - } - if (stp) { status = nfs4_check_openmode(stp, flags); if (status) goto out; renew_client(stp->st_stateowner->so_client); if (filpp) *filpp = stp->st_vfs_file; - } else { - status = nfs4_check_delegmode(dp, flags); - if (status) - goto out; - renew_client(dp->dl_client); - if (flags & DELEG_RET) - unhash_delegation(dp); - if (filpp) - *filpp = dp->dl_vfs_file; } status = nfs_ok; out: -- cgit v1.2.3-59-g8ed1b From fd03b09906c32aea7b47f1275c9cd6034141159d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 11:27:30 -0800 Subject: nfsd4: remove unneeded local variable We no longer need stidp. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 16fcb656f47f..099938ebc9d3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2056,7 +2056,6 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl { struct nfs4_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; - stateid_t *stidp; struct inode *ino = current_fh->fh_dentry->d_inode; __be32 status; @@ -2083,8 +2082,7 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl dprintk("NFSD: delegation stateid not found\n"); goto out; } - stidp = &dp->dl_stateid; - status = check_stateid_generation(stateid, stidp); + status = check_stateid_generation(stateid, &dp->dl_stateid); if (status) goto out; status = nfs4_check_delegmode(dp, flags); @@ -2105,8 +2103,7 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl goto out; if (!stp->st_stateowner->so_confirmed) goto out; - stidp = &stp->st_stateid; - status = check_stateid_generation(stateid, stidp); + status = check_stateid_generation(stateid, &stp->st_stateid); if (status) goto out; status = nfs4_check_openmode(stp, flags); -- cgit v1.2.3-59-g8ed1b From 819a8f539acf7838d62fec20e88401ff53303cd1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 12:13:24 -0800 Subject: nfsd4: remove some dprintk's I can't recall ever seeing these printk's used to debug a problem. I'll happily put them back if we see a case where they'd be useful. (Though if we do that the find_XXX() errors would probably be better reported in find_XXX() functions themselves.) Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 099938ebc9d3..909a7a493688 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2059,9 +2059,6 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl struct inode *ino = current_fh->fh_dentry->d_inode; __be32 status; - dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); if (filpp) *filpp = NULL; @@ -2078,10 +2075,8 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl status = nfserr_bad_stateid; if (!stateid->si_fileid) { /* delegation stateid */ dp = find_delegation_stateid(ino, stateid); - if (!dp) { - dprintk("NFSD: delegation stateid not found\n"); + if (!dp) goto out; - } status = check_stateid_generation(stateid, &dp->dl_stateid); if (status) goto out; @@ -2095,10 +2090,8 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl *filpp = dp->dl_vfs_file; } else { /* open or lock stateid */ stp = find_stateid(stateid, flags); - if (!stp) { - dprintk("NFSD: open or lock stateid not found\n"); + if (!stp) goto out; - } if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) goto out; if (!stp->st_stateowner->so_confirmed) -- cgit v1.2.3-59-g8ed1b From 3e633079e377d2b527a8390f63ceb887b5cabfbf Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 13:17:19 -0800 Subject: nfsd4: add a helper function to decide if stateid is delegation Make this check self-documenting. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 909a7a493688..d5555850cb64 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2048,6 +2048,11 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref) return nfs_ok; } +static int is_delegation_stateid(stateid_t *stateid) +{ + return stateid->si_fileid == 0; +} + /* * Checks for stateid operations */ @@ -2073,7 +2078,7 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl goto out; status = nfserr_bad_stateid; - if (!stateid->si_fileid) { /* delegation stateid */ + if (is_delegation_stateid(stateid)) { dp = find_delegation_stateid(ino, stateid); if (!dp) goto out; -- cgit v1.2.3-59-g8ed1b From 203a8c8e66278a5936a230edaac29017e50c88fb Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 13:29:14 -0800 Subject: nfsd4: separate delegreturn case from preprocess_stateid_op Delegreturn is enough a special case for preprocess_stateid_op to warrant just open-coding it in delegreturn. There should be no change in behavior here; we're just reshuffling code. Thanks to Yang Hongyang for catching a critical typo. Reviewed-by: Yang Hongyang Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 40 ++++++++++++++++++++++++++++------------ include/linux/nfsd/state.h | 1 - 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index d5555850cb64..3570a0d1133f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2000,10 +2000,7 @@ out: static inline __be32 check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) { - /* Trying to call delegreturn with a special stateid? Yuch: */ - if (!(flags & (RD_STATE | WR_STATE))) - return nfserr_bad_stateid; - else if (ONE_STATEID(stateid) && (flags & RD_STATE)) + if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; else if (locks_in_grace()) { /* Answer in remaining cases depends on existance of @@ -2024,8 +2021,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) static inline int io_during_grace_disallowed(struct inode *inode, int flags) { - return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) - && mandatory_lock(inode); + return locks_in_grace() && mandatory_lock(inode); } static int check_stateid_generation(stateid_t *in, stateid_t *ref) @@ -2089,8 +2085,6 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl if (status) goto out; renew_client(dp->dl_client); - if (flags & DELEG_RET) - unhash_delegation(dp); if (filpp) *filpp = dp->dl_vfs_file; } else { /* open or lock stateid */ @@ -2408,16 +2402,38 @@ __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_delegreturn *dr) { + struct nfs4_delegation *dp; + stateid_t *stateid = &dr->dr_stateid; + struct inode *inode; __be32 status; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) - goto out; + return status; + inode = cstate->current_fh.fh_dentry->d_inode; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &dr->dr_stateid, DELEG_RET, NULL); - nfs4_unlock_state(); + status = nfserr_bad_stateid; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + goto out; + status = nfserr_stale_stateid; + if (STALE_STATEID(stateid)) + goto out; + status = nfs_ok; + if (!is_delegation_stateid(stateid)) + goto out; + status = nfserr_bad_stateid; + dp = find_delegation_stateid(inode, stateid); + if (!dp) + goto out; + status = check_stateid_generation(stateid, &dp->dl_stateid); + if (status) + goto out; + renew_client(dp->dl_client); + + unhash_delegation(dp); out: + nfs4_unlock_state(); + return status; } diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 1130d534bb63..c9311a1e2e1a 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -263,7 +263,6 @@ struct nfs4_stateid { #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 #define CLOSE_STATE 0x00000040 -#define DELEG_RET 0x00000080 #define seqid_mutating_err(err) \ (((err) != nfserr_stale_clientid) && \ -- cgit v1.2.3-59-g8ed1b From 7e0f7cf582abd6c85232331dfe726a4e4b0fd98e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 13:32:28 -0800 Subject: nfsd4: fail when delegreturn gets a non-delegation stateid Previous cleanup reveals an obvious (though harmless) bug: when delegreturn gets a stateid that isn't for a delegation, it should return an error rather than doing nothing. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3570a0d1133f..6ae28e606afc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2418,10 +2418,9 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_stale_stateid; if (STALE_STATEID(stateid)) goto out; - status = nfs_ok; + status = nfserr_bad_stateid; if (!is_delegation_stateid(stateid)) goto out; - status = nfserr_bad_stateid; dp = find_delegation_stateid(inode, stateid); if (!dp) goto out; -- cgit v1.2.3-59-g8ed1b From 6150ef0dc7f734366d297e2eb5697ae458a1ea19 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 13:36:16 -0800 Subject: nfsd4: remove unused CHECK_FH flag All users now pass this, so it's meaningless. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 6 +++--- fs/nfsd/nfs4state.c | 2 +- include/linux/nfsd/state.h | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index af66073ed423..77f584f69dfe 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -519,7 +519,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* check stateid */ if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, &read->rd_stateid, - CHECK_FH | RD_STATE, &read->rd_filp))) { + RD_STATE, &read->rd_filp))) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } @@ -651,7 +651,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); + &setattr->sa_stateid, WR_STATE, NULL); nfs4_unlock_state(); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); @@ -690,7 +690,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, - CHECK_FH | WR_STATE, &filp); + WR_STATE, &filp); if (filp) get_file(filp); nfs4_unlock_state(); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6ae28e606afc..5957f7766bdc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2091,7 +2091,7 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl stp = find_stateid(stateid, flags); if (!stp) goto out; - if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) + if (nfs4_check_fh(current_fh, stp)) goto out; if (!stp->st_stateowner->so_confirmed) goto out; diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index c9311a1e2e1a..503b6bb53a56 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -256,7 +256,6 @@ struct nfs4_stateid { }; /* flags for preprocess_seqid_op() */ -#define CHECK_FH 0x00000001 #define CONFIRM 0x00000002 #define OPEN_STATE 0x00000004 #define LOCK_STATE 0x00000008 -- cgit v1.2.3-59-g8ed1b From 18f82731b7784ba81ee9b1ed6a8179b577fa898b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 15:23:01 -0800 Subject: nfsd4: rename io_during_grace_disallowed Use a slightly clearer, more concise name. Also removed unused argument. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 5957f7766bdc..89e575e7daea 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2019,7 +2019,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) * that are not able to provide mandatory locking. */ static inline int -io_during_grace_disallowed(struct inode *inode, int flags) +grace_disallows_io(struct inode *inode) { return locks_in_grace() && mandatory_lock(inode); } @@ -2063,7 +2063,7 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl if (filpp) *filpp = NULL; - if (io_during_grace_disallowed(ino, flags)) + if (grace_disallows_io(ino)) return nfserr_grace; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) -- cgit v1.2.3-59-g8ed1b From d7fdcfe0aaaf6dffca6fa857bab374182fe7ca8b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 21 Feb 2009 15:39:54 -0800 Subject: nfsd4: put_nfs4_client does not require state lock Since free_client() is guaranteed to only be called once, and to only touch the client structure itself (not any common data structures), it has no need for the state lock. Signed-off-by: J. Bruce Fields Cc: Alexandros Batsakis --- fs/nfsd/nfs4callback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3ddc9fb2e358..3fd7136321ca 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -490,8 +490,8 @@ out_put_cred: * Success or failure, now we're either waiting for lease expiration * or deleg_return. */ - nfs4_lock_state(); put_nfs4_client(clp); + nfs4_lock_state(); nfs4_put_delegation(dp); nfs4_unlock_state(); return; -- cgit v1.2.3-59-g8ed1b From 8b671b80707e4fc76adfe4387df07b3be1007c1e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 22 Feb 2009 14:51:34 -0800 Subject: nfsd4: remove use of mutex for file_hashtable As part of reducing the scope of the client_mutex, and in order to remove the need for mutexes from the callback code (so that callbacks can be done as asynchronous rpc calls), move manipulations of the file_hashtable under the recall_lock. Update the relevant comments while we're here. Signed-off-by: J. Bruce Fields Cc: Alexandros Batsakis Reviewed-by: Benny Halevy --- fs/nfsd/nfs4callback.c | 2 -- fs/nfsd/nfs4state.c | 47 +++++++++++++++++++++++----------------------- include/linux/nfsd/state.h | 2 +- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 3fd7136321ca..5dcd38e5f138 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -491,8 +491,6 @@ out_put_cred: * or deleg_return. */ put_nfs4_client(clp); - nfs4_lock_state(); nfs4_put_delegation(dp); - nfs4_unlock_state(); return; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 89e575e7daea..54651aa45790 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -78,14 +78,18 @@ static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, state static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static void nfs4_set_recdir(char *recdir); -/* Locking: - * - * client_mutex: - * protects clientid_hashtbl[], clientstr_hashtbl[], - * unconfstr_hashtbl[], uncofid_hashtbl[]. - */ +/* Locking: */ + +/* Currently used for almost all code touching nfsv4 state: */ static DEFINE_MUTEX(client_mutex); +/* + * Currently used for the del_recall_lru and file hash table. In an + * effort to decrease the scope of the client_mutex, this spinlock may + * eventually cover more: + */ +static DEFINE_SPINLOCK(recall_lock); + static struct kmem_cache *stateowner_slab = NULL; static struct kmem_cache *file_slab = NULL; static struct kmem_cache *stateid_slab = NULL; @@ -116,33 +120,23 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -/* - * Delegation state - */ - -/* recall_lock protects the del_recall_lru */ -static DEFINE_SPINLOCK(recall_lock); static struct list_head del_recall_lru; -static void -free_nfs4_file(struct kref *kref) -{ - struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref); - list_del(&fp->fi_hash); - iput(fp->fi_inode); - kmem_cache_free(file_slab, fp); -} - static inline void put_nfs4_file(struct nfs4_file *fi) { - kref_put(&fi->fi_ref, free_nfs4_file); + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { + list_del(&fi->fi_hash); + spin_unlock(&recall_lock); + iput(fi->fi_inode); + kmem_cache_free(file_slab, fi); + } } static inline void get_nfs4_file(struct nfs4_file *fi) { - kref_get(&fi->fi_ref); + atomic_inc(&fi->fi_ref); } static int num_delegations; @@ -1000,11 +994,13 @@ alloc_init_file(struct inode *ino) fp = kmem_cache_alloc(file_slab, GFP_KERNEL); if (fp) { - kref_init(&fp->fi_ref); + atomic_set(&fp->fi_ref, 1); INIT_LIST_HEAD(&fp->fi_hash); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); + spin_lock(&recall_lock); list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; fp->fi_had_conflict = false; @@ -1177,12 +1173,15 @@ find_file(struct inode *ino) unsigned int hashval = file_hashval(ino); struct nfs4_file *fp; + spin_lock(&recall_lock); list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { if (fp->fi_inode == ino) { get_nfs4_file(fp); + spin_unlock(&recall_lock); return fp; } } + spin_unlock(&recall_lock); return NULL; } diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 503b6bb53a56..a6e4a00fa392 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -214,7 +214,7 @@ struct nfs4_stateowner { * share_acces, share_deny on the file. */ struct nfs4_file { - struct kref fi_ref; + atomic_t fi_ref; struct list_head fi_hash; /* hash by "struct inode *" */ struct list_head fi_stateids; struct list_head fi_delegations; -- cgit v1.2.3-59-g8ed1b From 418cd20aa19b54554cab383e2fd0d1cb8c4732ee Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 22 Feb 2009 15:52:13 -0800 Subject: nfsd4: fix do_probe_callback errors The errors returned aren't used. Just return 0 and make them available to a dprintk(). Also, consistently use -ERRNO errors instead of nfs errors. Signed-off-by: J. Bruce Fields Reviewed-by: Benny Halevy --- fs/nfsd/nfs4callback.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 5dcd38e5f138..8d55f5047503 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -389,12 +389,10 @@ static int do_probe_callback(void *data) .rpc_argp = clp, }; struct rpc_clnt *client; - int status; + int status = -EINVAL; - if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) { - status = nfserr_cb_path_down; + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) goto out_err; - } /* Initialize address */ memset(&addr, 0, sizeof(addr)); @@ -405,8 +403,9 @@ static int do_probe_callback(void *data) /* Create RPC client */ client = rpc_create(&args); if (IS_ERR(client)) { - dprintk("NFSD: couldn't create callback client\n"); status = PTR_ERR(client); + dprintk("NFSD: couldn't create callback client: %d\n", + status); goto out_err; } @@ -422,10 +421,10 @@ static int do_probe_callback(void *data) out_release_client: rpc_shutdown_client(client); out_err: - dprintk("NFSD: warning: no callback path to client %.*s\n", - (int)clp->cl_name.len, clp->cl_name.data); + dprintk("NFSD: warning: no callback path to client %.*s: error %d\n", + (int)clp->cl_name.len, clp->cl_name.data, status); put_nfs4_client(clp); - return status; + return 0; } /* -- cgit v1.2.3-59-g8ed1b From a601caeda21c0e94c153dbd146ec0899cc5f324f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 22 Feb 2009 16:43:45 -0800 Subject: nfsd4: move rpc_client setup to a separate function Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 8d55f5047503..290289bd44f7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -361,9 +361,8 @@ static struct rpc_program cb_program = { /* Reference counting, callback cleanup, etc., all look racy as heck. * And why is cb_set an atomic? */ -static int do_probe_callback(void *data) +static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp) { - struct nfs4_client *clp = data; struct sockaddr_in addr; struct nfs4_callback *cb = &clp->cl_callback; struct rpc_timeout timeparms = { @@ -384,15 +383,10 @@ static int do_probe_callback(void *data) .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), .client_name = clp->cl_principal, }; - struct rpc_message msg = { - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], - .rpc_argp = clp, - }; struct rpc_clnt *client; - int status = -EINVAL; if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) - goto out_err; + return ERR_PTR(-EINVAL); /* Initialize address */ memset(&addr, 0, sizeof(addr)); @@ -402,6 +396,25 @@ static int do_probe_callback(void *data) /* Create RPC client */ client = rpc_create(&args); + if (IS_ERR(client)) + dprintk("NFSD: couldn't create callback client: %ld\n", + PTR_ERR(client)); + return client; + +} + +static int do_probe_callback(void *data) +{ + struct nfs4_client *clp = data; + struct nfs4_callback *cb = &clp->cl_callback; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, + }; + struct rpc_clnt *client; + int status; + + client = setup_callback_client(clp); if (IS_ERR(client)) { status = PTR_ERR(client); dprintk("NFSD: couldn't create callback client: %d\n", -- cgit v1.2.3-59-g8ed1b From 1e685ec270cb97680be4eb8cf6b615f5f7f1403a Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 4 Mar 2009 23:06:06 +0200 Subject: NFSD: return nfsv4 error code nfserr_notsupp rather than nfsv[23]'s nfserr_opnotsupp Thanks for Bill Baker at sun.com for catching this at Connectathon 2009. This bug was introduced in 2.6.27 Signed-off-by: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9250067943d8..aaf5f234df25 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1005,7 +1005,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) static __be32 nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) { - return nfserr_opnotsupp; + return nfserr_notsupp; } typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); -- cgit v1.2.3-59-g8ed1b From 31dec2538e45e9fff2007ea1f4c6bae9f78db724 Mon Sep 17 00:00:00 2001 From: David Shaw Date: Thu, 5 Mar 2009 20:16:14 -0500 Subject: Short write in nfsd becomes a full write to the client If a filesystem being written to via NFS returns a short write count (as opposed to an error) to nfsd, nfsd treats that as a success for the entire write, rather than the short count that actually succeeded. For example, given a 8192 byte write, if the underlying filesystem only writes 4096 bytes, nfsd will ack back to the nfs client that all 8192 bytes were written. The nfs client does have retry logic for short writes, but this is never called as the client is told the complete write succeeded. There are probably other ways it could happen, but in my case it happened with a fuse (filesystem in userspace) filesystem which can rather easily have a partial write. Here is a patch to properly return the short write count to the client. Signed-off-by: David Shaw Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3proc.c | 5 +++-- fs/nfsd/nfs4proc.c | 7 +++++-- fs/nfsd/nfsproc.c | 3 ++- fs/nfsd/vfs.c | 13 +++++++------ include/linux/nfsd/nfsd.h | 2 +- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 579ce8c69daa..7c9fe838f038 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -203,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, struct nfsd3_writeres *resp) { __be32 nfserr; + unsigned long cnt = argp->len; dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", SVCFH_fmt(&argp->fh), @@ -215,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, nfserr = nfsd_write(rqstp, &resp->fh, NULL, argp->offset, rqstp->rq_vec, argp->vlen, - argp->len, + &cnt, &resp->committed); - resp->count = argp->count; + resp->count = cnt; RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 77f584f69dfe..283d77a47120 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -682,6 +682,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file *filp = NULL; u32 *p; __be32 status = nfs_ok; + unsigned long cnt; /* no need to check permission - this will be done in nfsd_write() */ @@ -700,7 +701,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } - write->wr_bytes_written = write->wr_buflen; + cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; p = (u32 *)write->wr_verifier.data; *p++ = nfssvc_boot.tv_sec; @@ -708,10 +709,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_write(rqstp, &cstate->current_fh, filp, write->wr_offset, rqstp->rq_vec, write->wr_vlen, - write->wr_buflen, &write->wr_how_written); + &cnt, &write->wr_how_written); if (filp) fput(filp); + write->wr_bytes_written = cnt; + if (status == nfserr_symlink) status = nfserr_inval; return status; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6f7f26351227..e298e260b5f1 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, { __be32 nfserr; int stable = 1; + unsigned long cnt = argp->len; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, argp->offset, rqstp->rq_vec, argp->vlen, - argp->len, + &cnt, &stable); return nfsd_return_attrs(nfserr, resp); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0c076293155d..54404d730809 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -960,7 +960,7 @@ static void kill_suid(struct dentry *dentry) static __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long cnt, int *stablep) + unsigned long *cnt, int *stablep) { struct svc_export *exp; struct dentry *dentry; @@ -974,7 +974,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, err = nfserr_perm; if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) + (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt))) goto out; #endif @@ -1006,7 +1006,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); set_fs(oldfs); if (host_err >= 0) { - nfsdstats.io_write += cnt; + nfsdstats.io_write += host_err; fsnotify_modify(file->f_path.dentry); } @@ -1051,9 +1051,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, } dprintk("nfsd: write complete host_err=%d\n", host_err); - if (host_err >= 0) + if (host_err >= 0) { err = 0; - else + *cnt = host_err; + } else err = nfserrno(host_err); out: return err; @@ -1095,7 +1096,7 @@ out: */ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, + loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int *stablep) { __be32 err = 0; diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 16f7b403d9c1..54beda12d26b 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -105,7 +105,7 @@ void nfsd_close(struct file *); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, - loff_t, struct kvec *,int, unsigned long, int *); + loff_t, struct kvec *,int, unsigned long *, int *); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, -- cgit v1.2.3-59-g8ed1b From a1c8c4d1ff54c6c86930ee3c4c73c69eeb9ede61 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 9 Mar 2009 12:17:29 -0400 Subject: nfsd4: support putpubfh operation Currently putpubfh returns NFSERR_OPNOTSUPP, which isn't actually allowed for v4. The right error is probably NFSERR_NOTSUPP. But let's just implement it; though rarely seen, it can be used by Solaris (with a special mount option), is mandated by the rfc, and is trivial for us to support. Thanks to Yang Hongyang for pointing out the original problem, and to Mike Eisler, Tom Talpey, Trond Myklebust, and Dave Noveck for further argument.... Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfs4xdr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 283d77a47120..36d74cda87a8 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1045,7 +1045,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { .op_name = "OP_PUTFH", }, [OP_PUTPUBFH] = { - /* unsupported, just for future reference: */ + .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, .op_name = "OP_PUTPUBFH", }, diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index aaf5f234df25..76a0b2a8d69b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1031,7 +1031,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop, [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, [OP_READ] = (nfsd4_dec)nfsd4_decode_read, [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, -- cgit v1.2.3-59-g8ed1b From 05f4f678b0511a24795a017b5332455077be3b1c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 13 Mar 2009 16:02:59 -0400 Subject: nfsd4: don't do lookup within readdir in recovery code The main nfsd code was recently modified to no longer do lookups from withing the readdir callback, to avoid locking problems on certain filesystems. This (rather hacky, and overdue for replacement) NFSv4 recovery code has the same problem. Fix it to build up a list of names (instead of dentries) and do the lookups afterwards. Reported symptoms were a deadlock in the xfs code (called from nfsd4_recdir_load), with /var/lib/nfs on xfs. Signed-off-by: J. Bruce Fields Reported-by: David Warren --- fs/nfsd/nfs4recover.c | 71 ++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 74f7b67567fd..b11cf8d34280 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -182,36 +182,26 @@ out_unlock: typedef int (recdir_func)(struct dentry *, struct dentry *); -struct dentry_list { - struct dentry *dentry; +struct name_list { + char name[HEXDIR_LEN]; struct list_head list; }; -struct dentry_list_arg { - struct list_head dentries; - struct dentry *parent; -}; - static int -nfsd4_build_dentrylist(void *arg, const char *name, int namlen, +nfsd4_build_namelist(void *arg, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct dentry_list_arg *dla = arg; - struct list_head *dentries = &dla->dentries; - struct dentry *parent = dla->parent; - struct dentry *dentry; - struct dentry_list *child; + struct list_head *names = arg; + struct name_list *entry; - if (name && isdotent(name, namlen)) + if (namlen != HEXDIR_LEN - 1) return 0; - dentry = lookup_one_len(name, parent, namlen); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - child = kmalloc(sizeof(*child), GFP_KERNEL); - if (child == NULL) + entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); + if (entry == NULL) return -ENOMEM; - child->dentry = dentry; - list_add(&child->list, dentries); + memcpy(entry->name, name, HEXDIR_LEN - 1); + entry->name[HEXDIR_LEN - 1] = '\0'; + list_add(&entry->list, names); return 0; } @@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) { const struct cred *original_cred; struct file *filp; - struct dentry_list_arg dla = { - .parent = dir, - }; - struct list_head *dentries = &dla.dentries; - struct dentry_list *child; + LIST_HEAD(names); + struct name_list *entry; + struct dentry *dentry; int status; if (!rec_dir_init) @@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) status = nfs4_save_creds(&original_cred); if (status < 0) return status; - INIT_LIST_HEAD(dentries); filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, current_cred()); status = PTR_ERR(filp); if (IS_ERR(filp)) goto out; - INIT_LIST_HEAD(dentries); - status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla); + status = vfs_readdir(filp, nfsd4_build_namelist, &names); fput(filp); - while (!list_empty(dentries)) { - child = list_entry(dentries->next, struct dentry_list, list); - status = f(dir, child->dentry); + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + + dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out; + } + status = f(dir, dentry); + dput(dentry); if (status) goto out; - list_del(&child->list); - dput(child->dentry); - kfree(child); + list_del(&entry->list); + kfree(entry); } out: - while (!list_empty(dentries)) { - child = list_entry(dentries->next, struct dentry_list, list); - list_del(&child->list); - dput(child->dentry); - kfree(child); + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + list_del(&entry->list); + kfree(entry); } nfs4_reset_creds(original_cred); return status; -- cgit v1.2.3-59-g8ed1b From 5cb031b0afddad73ea4191c9f0b76d20ca447dc0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 14 Mar 2009 16:38:41 -0400 Subject: nfsd4: remove redundant check from nfsd4_open Note that we already checked for this invalid case at the top of this function. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 36d74cda87a8..f156b85b4129 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -206,10 +206,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: - status = nfserr_inval; - if (open->op_create) - goto out; - /* fall through */ case NFS4_OPEN_CLAIM_NULL: /* * (1) set CURRENT_FH to the file being opened, -- cgit v1.2.3-59-g8ed1b From 8bbfa9f3889b643fc7de82c0c761ef17097f8faf Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Tue, 13 Jan 2009 21:26:34 +1100 Subject: knfsd: remove the nfsd thread busy histogram Stop gathering the data that feeds the 'th' line in /proc/net/rpc/nfsd because the questionable data provided is not worth the scalability impact of calculating it. Instead, always report zeroes. The current approach suffers from three major issues: 1. update_thread_usage() increments buckets by call service time or call arrival time...in jiffies. On lightly loaded machines, call service times are usually < 1 jiffy; on heavily loaded machines call arrival times will be << 1 jiffy. So a large portion of the updates to the buckets are rounded down to zero, and the histogram is undercounting. 2. As seen previously on the nfs mailing list, the format in which the histogram is presented is cryptic, difficult to explain, and difficult to use. 3. Updating the histogram requires taking a global spinlock and dirtying the global variables nfsd_last_call, nfsd_busy, and nfsdstats *twice* on every RPC call, which is a significant scaling limitation. Testing on a 4 CPU 4 NIC Altix using 4 IRIX clients each doing 1K streaming reads at full line rate, shows the stats update code (inlined into nfsd()) takes about 1.7% of each CPU. This patch drops the contribution from nfsd() into the profile noise. This patch is a forward-ported version of knfsd-remove-nfsd-threadstats which has been shipping in the SGI "Enhanced NFS" product since 2006. In that time, exactly one customer has noticed that the threadstats were missing. It has been previously posted: http://article.gmane.org/gmane.linux.nfs/10376 and more recently requested to be posted again. Signed-off-by: Greg Banks Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 07e4f5d7baa8..c3eb0759fd57 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -40,9 +40,6 @@ extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); struct timeval nfssvc_boot; -static atomic_t nfsd_busy; -static unsigned long nfsd_last_call; -static DEFINE_SPINLOCK(nfsd_call_lock); /* * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members @@ -227,7 +224,6 @@ int nfsd_create_serv(void) nfsd_max_blksize /= 2; } - atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); @@ -376,26 +372,6 @@ nfsd_svc(unsigned short port, int nrservs) return error; } -static inline void -update_thread_usage(int busy_threads) -{ - unsigned long prev_call; - unsigned long diff; - int decile; - - spin_lock(&nfsd_call_lock); - prev_call = nfsd_last_call; - nfsd_last_call = jiffies; - decile = busy_threads*10/nfsdstats.th_cnt; - if (decile>0 && decile <= 10) { - diff = nfsd_last_call - prev_call; - if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP) - nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP; - if (decile == 10) - nfsdstats.th_fullcnt++; - } - spin_unlock(&nfsd_call_lock); -} /* * This is the NFS server kernel thread @@ -464,8 +440,6 @@ nfsd(void *vrqstp) continue; } - update_thread_usage(atomic_read(&nfsd_busy)); - atomic_inc(&nfsd_busy); /* Lock the export hash tables for reading. */ exp_readlock(); @@ -474,8 +448,6 @@ nfsd(void *vrqstp) /* Unlock export hash tables */ exp_readunlock(); - update_thread_usage(atomic_read(&nfsd_busy)); - atomic_dec(&nfsd_busy); } /* Clear signals before calling svc_exit_thread() */ -- cgit v1.2.3-59-g8ed1b From 59a252ff8c0f2fa32c896f69d56ae33e641ce7ad Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Tue, 13 Jan 2009 21:26:35 +1100 Subject: knfsd: avoid overloading the CPU scheduler with enormous load averages Avoid overloading the CPU scheduler with enormous load averages when handling high call-rate NFS loads. When the knfsd bottom half is made aware of an incoming call by the socket layer, it tries to choose an nfsd thread and wake it up. As long as there are idle threads, one will be woken up. If there are lot of nfsd threads (a sensible configuration when the server is disk-bound or is running an HSM), there will be many more nfsd threads than CPUs to run them. Under a high call-rate low service-time workload, the result is that almost every nfsd is runnable, but only a handful are actually able to run. This situation causes two significant problems: 1. The CPU scheduler takes over 10% of each CPU, which is robbing the nfsd threads of valuable CPU time. 2. At a high enough load, the nfsd threads starve userspace threads of CPU time, to the point where daemons like portmap and rpc.mountd do not schedule for tens of seconds at a time. Clients attempting to mount an NFS filesystem timeout at the very first step (opening a TCP connection to portmap) because portmap cannot wake up from select() and call accept() in time. Disclaimer: these effects were observed on a SLES9 kernel, modern kernels' schedulers may behave more gracefully. The solution is simple: keep in each svc_pool a counter of the number of threads which have been woken but have not yet run, and do not wake any more if that count reaches an arbitrary small threshold. Testing was on a 4 CPU 4 NIC Altix using 4 IRIX clients, each with 16 synthetic client threads simulating an rsync (i.e. recursive directory listing) workload reading from an i386 RH9 install image (161480 regular files in 10841 directories) on the server. That tree is small enough to fill in the server's RAM so no disk traffic was involved. This setup gives a sustained call rate in excess of 60000 calls/sec before being CPU-bound on the server. The server was running 128 nfsds. Profiling showed schedule() taking 6.7% of every CPU, and __wake_up() taking 5.2%. This patch drops those contributions to 3.0% and 2.2%. Load average was over 120 before the patch, and 20.9 after. This patch is a forward-ported version of knfsd-avoid-nfsd-overload which has been shipping in the SGI "Enhanced NFS" product since 2006. It has been posted before: http://article.gmane.org/gmane.linux.nfs/10374 Signed-off-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 2 ++ net/sunrpc/svc_xprt.c | 25 ++++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3435d24bfe55..39ec186a492d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -41,6 +41,7 @@ struct svc_pool { struct list_head sp_sockets; /* pending sockets */ unsigned int sp_nrthreads; /* # of threads in pool */ struct list_head sp_all_threads; /* all server threads */ + int sp_nwaking; /* number of threads woken but not yet active */ } ____cacheline_aligned_in_smp; /* @@ -264,6 +265,7 @@ struct svc_rqst { * cache pages */ wait_queue_head_t rq_wait; /* synchronization */ struct task_struct *rq_task; /* service thread */ + int rq_waking; /* 1 if thread is being woken */ }; /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e588df5d6b34..0551b6b6cf8c 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -14,6 +14,8 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +#define SVC_MAX_WAKING 5 + static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); @@ -298,6 +300,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) struct svc_pool *pool; struct svc_rqst *rqstp; int cpu; + int thread_avail; if (!(xprt->xpt_flags & ((1<sp_lock); - if (!list_empty(&pool->sp_threads) && - !list_empty(&pool->sp_sockets)) - printk(KERN_ERR - "svc_xprt_enqueue: " - "threads and transports both waiting??\n"); - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { /* Don't enqueue dead transports */ dprintk("svc: transport %p is dead, not enqueued\n", xprt); @@ -353,7 +350,14 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) } process: - if (!list_empty(&pool->sp_threads)) { + /* Work out whether threads are available */ + thread_avail = !list_empty(&pool->sp_threads); /* threads are asleep */ + if (pool->sp_nwaking >= SVC_MAX_WAKING) { + /* too many threads are runnable and trying to wake up */ + thread_avail = 0; + } + + if (thread_avail) { rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, rq_list); @@ -368,6 +372,8 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) svc_xprt_get(xprt); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + rqstp->rq_waking = 1; + pool->sp_nwaking++; BUG_ON(xprt->xpt_pool != pool); wake_up(&rqstp->rq_wait); } else { @@ -633,6 +639,11 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) return -EINTR; spin_lock_bh(&pool->sp_lock); + if (rqstp->rq_waking) { + rqstp->rq_waking = 0; + pool->sp_nwaking--; + BUG_ON(pool->sp_nwaking < 0); + } xprt = svc_xprt_dequeue(pool); if (xprt) { rqstp->rq_xprt = xprt; -- cgit v1.2.3-59-g8ed1b From 03cf6c9f49a8fea953d38648d016e3f46e814991 Mon Sep 17 00:00:00 2001 From: Greg Banks Date: Tue, 13 Jan 2009 21:26:36 +1100 Subject: knfsd: add file to export stats about nfsd pools Add /proc/fs/nfsd/pool_stats to export to userspace various statistics about the operation of rpc server thread pools. This patch is based on a forward-ported version of knfsd-add-pool-thread-stats which has been shipping in the SGI "Enhanced NFS" product since 2006 and which was previously posted: http://article.gmane.org/gmane.linux.nfs/10375 It has also been updated thus: * moved EXPORT_SYMBOL() to near the function it exports * made the new struct struct seq_operations const * used SEQ_START_TOKEN instead of ((void *)1) * merged fix from SGI PV 990526 "sunrpc: use dprintk instead of printk in svc_pool_stats_*()" by Harshula Jayasuriya. * merged fix from SGI PV 964001 "Crash reading pool_stats before nfsds are started". Signed-off-by: Greg Banks Signed-off-by: Harshula Jayasuriya Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 12 ++++++ fs/nfsd/nfssvc.c | 7 ++++ include/linux/sunrpc/svc.h | 11 +++++ net/sunrpc/svc_xprt.c | 100 ++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 129 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3d93b2064ce5..4adebb6312c4 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -60,6 +60,7 @@ enum { NFSD_FO_UnlockFS, NFSD_Threads, NFSD_Pool_Threads, + NFSD_Pool_Stats, NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, @@ -172,6 +173,16 @@ static const struct file_operations exports_operations = { .owner = THIS_MODULE, }; +extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); + +static struct file_operations pool_stats_operations = { + .open = nfsd_pool_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + /*----------------------------------------------------------------------------*/ /* * payload - write methods @@ -1246,6 +1257,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index c3eb0759fd57..ef0a3686639d 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -546,3 +546,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); return 1; } + +int nfsd_pool_stats_open(struct inode *inode, struct file *file) +{ + if (nfsd_serv == NULL) + return -ENODEV; + return svc_pool_stats_open(nfsd_serv, file); +} diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 39ec186a492d..9f9f699dd469 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -24,6 +24,15 @@ */ typedef int (*svc_thread_fn)(void *); +/* statistics for svc_pool structures */ +struct svc_pool_stats { + unsigned long packets; + unsigned long sockets_queued; + unsigned long threads_woken; + unsigned long overloads_avoided; + unsigned long threads_timedout; +}; + /* * * RPC service thread pool. @@ -42,6 +51,7 @@ struct svc_pool { unsigned int sp_nrthreads; /* # of threads in pool */ struct list_head sp_all_threads; /* all server threads */ int sp_nwaking; /* number of threads woken but not yet active */ + struct svc_pool_stats sp_stats; /* statistics on pool operation */ } ____cacheline_aligned_in_smp; /* @@ -396,6 +406,7 @@ struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, sa_family_t, void (*shutdown)(struct svc_serv *), svc_thread_fn, struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); +int svc_pool_stats_open(struct svc_serv *serv, struct file *file); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); int svc_register(const struct svc_serv *, const unsigned short, diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 0551b6b6cf8c..1e66f2491460 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -318,6 +318,8 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) goto out_unlock; } + pool->sp_stats.packets++; + /* Mark transport as busy. It will remain in this state until * the provider calls svc_xprt_received. We update XPT_BUSY * atomically because it also guards against trying to enqueue @@ -355,6 +357,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) if (pool->sp_nwaking >= SVC_MAX_WAKING) { /* too many threads are runnable and trying to wake up */ thread_avail = 0; + pool->sp_stats.overloads_avoided++; } if (thread_avail) { @@ -374,11 +377,13 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); rqstp->rq_waking = 1; pool->sp_nwaking++; + pool->sp_stats.threads_woken++; BUG_ON(xprt->xpt_pool != pool); wake_up(&rqstp->rq_wait); } else { dprintk("svc: transport %p put into queue\n", xprt); list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + pool->sp_stats.sockets_queued++; BUG_ON(xprt->xpt_pool != pool); } @@ -591,6 +596,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) int pages; struct xdr_buf *arg; DECLARE_WAITQUEUE(wait, current); + long time_left; dprintk("svc: server %p waiting for data (to = %ld)\n", rqstp, timeout); @@ -676,12 +682,14 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) add_wait_queue(&rqstp->rq_wait, &wait); spin_unlock_bh(&pool->sp_lock); - schedule_timeout(timeout); + time_left = schedule_timeout(timeout); try_to_freeze(); spin_lock_bh(&pool->sp_lock); remove_wait_queue(&rqstp->rq_wait, &wait); + if (!time_left) + pool->sp_stats.threads_timedout++; xprt = rqstp->rq_xprt; if (!xprt) { @@ -1114,3 +1122,93 @@ int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen) return totlen; } EXPORT_SYMBOL_GPL(svc_xprt_names); + + +/*----------------------------------------------------------------------------*/ + +static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) +{ + unsigned int pidx = (unsigned int)*pos; + struct svc_serv *serv = m->private; + + dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); + + lock_kernel(); + /* bump up the pseudo refcount while traversing */ + svc_get(serv); + unlock_kernel(); + + if (!pidx) + return SEQ_START_TOKEN; + return (pidx > serv->sv_nrpools ? NULL : &serv->sv_pools[pidx-1]); +} + +static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct svc_pool *pool = p; + struct svc_serv *serv = m->private; + + dprintk("svc_pool_stats_next, *pos=%llu\n", *pos); + + if (p == SEQ_START_TOKEN) { + pool = &serv->sv_pools[0]; + } else { + unsigned int pidx = (pool - &serv->sv_pools[0]); + if (pidx < serv->sv_nrpools-1) + pool = &serv->sv_pools[pidx+1]; + else + pool = NULL; + } + ++*pos; + return pool; +} + +static void svc_pool_stats_stop(struct seq_file *m, void *p) +{ + struct svc_serv *serv = m->private; + + lock_kernel(); + /* this function really, really should have been called svc_put() */ + svc_destroy(serv); + unlock_kernel(); +} + +static int svc_pool_stats_show(struct seq_file *m, void *p) +{ + struct svc_pool *pool = p; + + if (p == SEQ_START_TOKEN) { + seq_puts(m, "# pool packets-arrived sockets-enqueued threads-woken overloads-avoided threads-timedout\n"); + return 0; + } + + seq_printf(m, "%u %lu %lu %lu %lu %lu\n", + pool->sp_id, + pool->sp_stats.packets, + pool->sp_stats.sockets_queued, + pool->sp_stats.threads_woken, + pool->sp_stats.overloads_avoided, + pool->sp_stats.threads_timedout); + + return 0; +} + +static const struct seq_operations svc_pool_stats_seq_ops = { + .start = svc_pool_stats_start, + .next = svc_pool_stats_next, + .stop = svc_pool_stats_stop, + .show = svc_pool_stats_show, +}; + +int svc_pool_stats_open(struct svc_serv *serv, struct file *file) +{ + int err; + + err = seq_open(file, &svc_pool_stats_seq_ops); + if (!err) + ((struct seq_file *) file->private_data)->private = serv; + return err; +} +EXPORT_SYMBOL(svc_pool_stats_open); + +/*----------------------------------------------------------------------------*/ -- cgit v1.2.3-59-g8ed1b From 026722c25e6eb018eab8b9a3c198c258f5b7a2e7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 18 Mar 2009 15:06:26 -0400 Subject: nfsd4: don't check ip address in setclientid The spec allows clients to change ip address, so we shouldn't be requiring that setclientid always come from the same address. For example, a client could reboot and get a new dhcpd address, but still present the same clientid to the server. In that case the server should revoke the client's previous state and allow it to continue, instead of (as it currently does) returning a CLID_INUSE error. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 54651aa45790..070e9e5c0452 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -791,10 +791,9 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (conf) { /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; - if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) - || conf->cl_addr != sin->sin_addr.s_addr) { - dprintk("NFSD: setclientid: string in use by clientat %pI4\n", - &conf->cl_addr); + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + dprintk("NFSD: setclientid: string in use by client" + " at %pI4\n", &conf->cl_addr); goto out; } } -- cgit v1.2.3-59-g8ed1b From 47a14ef1af48c696b214ac168f056ddc79793d0e Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 21 Oct 2008 14:13:47 -0400 Subject: svcrpc: take advantage of tcp autotuning Allow the NFSv4 server to make use of TCP autotuning behaviour, which was previously disabled by setting the sk_userlocks variable. Set the receive buffers to be big enough to receive the whole RPC request, and set this for the listening socket, not the accept socket. Remove the code that readjusts the receive/send buffer sizes for the accepted socket. Previously this code was used to influence the TCP window management behaviour, which is no longer needed when autotuning is enabled. This can improve IO bandwidth on networks with high bandwidth-delay products, where a large tcp window is required. It also simplifies performance tuning, since getting adequate tcp buffers previously required increasing the number of nfsd threads. Signed-off-by: Olga Kornievskaia Cc: Jim Rees Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5763e6460fea..7a2a90fb2e06 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -345,7 +345,6 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, lock_sock(sock->sk); sock->sk->sk_sndbuf = snd * 2; sock->sk->sk_rcvbuf = rcv * 2; - sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; release_sock(sock->sk); #endif } @@ -797,23 +796,6 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) - /* sndbuf needs to have room for one request - * per thread, otherwise we can stall even when the - * network isn't a bottleneck. - * - * We count all threads rather than threads in a - * particular pool, which provides an upper bound - * on the number of threads which will access the socket. - * - * rcvbuf just needs to be able to hold a few requests. - * Normally they will be removed from the queue - * as soon a a complete request arrives. - */ - svc_sock_setbufsize(svsk->sk_sock, - (serv->sv_nrthreads+3) * serv->sv_max_mesg, - 3 * serv->sv_max_mesg); - clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* Receive data. If we haven't got the record length yet, get @@ -1061,15 +1043,6 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; - /* initialise setting must have enough space to - * receive and respond to one request. - * svc_tcp_recvfrom will re-adjust if necessary - */ - svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, - 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - - set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); @@ -1140,8 +1113,14 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Initialize the socket */ if (sock->type == SOCK_DGRAM) svc_udp_init(svsk, serv); - else + else { + /* initialise setting must have enough space to + * receive and respond to one request. + */ + svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg, + 4 * serv->sv_max_mesg); svc_tcp_init(svsk, serv); + } /* * We start one listener per sv_serv. We want AF_INET -- cgit v1.2.3-59-g8ed1b From 0953e620de0538cbd081f1b45126f6098112a598 Mon Sep 17 00:00:00 2001 From: "Sachin S. Prabhu" Date: Mon, 23 Feb 2009 16:22:03 +0000 Subject: Inconsistent setattr behaviour There is an inconsistency seen in the behaviour of nfs compared to other local filesystems on linux when changing owner or group of a directory. If the directory has SUID/SGID flags set, on changing owner or group on the directory, the flags are stripped off on nfs. These flags are maintained on other filesystems such as ext3. To reproduce on a nfs share or local filesystem, run the following commands mkdir test; chmod +s+g test; chown user1 test; ls -ld test On the nfs share, the flags are stripped and the output seen is drwxr-xr-x 2 user1 root 4096 Feb 23 2009 test On other local filesystems(ex: ext3), the flags are not stripped and the output seen is drwsr-sr-x 2 user1 root 4096 Feb 23 13:57 test chown_common() called from sys_chown() will only strip the flags if the inode is not a directory. static int chown_common(struct dentry * dentry, uid_t user, gid_t group) { .. if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; .. } See: http://www.opengroup.org/onlinepubs/7990989775/xsh/chown.html "If the path argument refers to a regular file, the set-user-ID (S_ISUID) and set-group-ID (S_ISGID) bits of the file mode are cleared upon successful return from chown(), unless the call is made by a process with appropriate privileges, in which case it is implementation-dependent whether these bits are altered. If chown() is successfully invoked on a file that is not a regular file, these bits may be cleared. These bits are defined in ." The behaviour as it stands does not appear to violate POSIX. However the actions performed are inconsistent when comparing ext3 and nfs. Signed-off-by: Sachin Prabhu Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 54404d730809..8790571b30fd 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, } /* Revoke setuid/setgid on chown */ - if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || - ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { + if (!S_ISDIR(inode->i_mode) && + (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || + ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ -- cgit v1.2.3-59-g8ed1b From 2795e53b4ed5d1f49d2283f416c922f55ec7d461 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 12 Mar 2009 12:07:14 -0400 Subject: SUNRPC: Clean up static inline functions in svc_xprt.h Clean up: Enable the use of const arguments in higher level svc_ APIs by adding const to the arguments of the helper functions in svc_xprt.h Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 46 +++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0127daca4354..959b931b6053 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -88,29 +88,32 @@ static inline void svc_xprt_get(struct svc_xprt *xprt) kref_get(&xprt->xpt_ref); } static inline void svc_xprt_set_local(struct svc_xprt *xprt, - struct sockaddr *sa, int salen) + const struct sockaddr *sa, + const size_t salen) { memcpy(&xprt->xpt_local, sa, salen); xprt->xpt_locallen = salen; } static inline void svc_xprt_set_remote(struct svc_xprt *xprt, - struct sockaddr *sa, int salen) + const struct sockaddr *sa, + const size_t salen) { memcpy(&xprt->xpt_remote, sa, salen); xprt->xpt_remotelen = salen; } -static inline unsigned short svc_addr_port(struct sockaddr *sa) +static inline unsigned short svc_addr_port(const struct sockaddr *sa) { - unsigned short ret = 0; + const struct sockaddr_in *sin = (const struct sockaddr_in *)sa; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa; + switch (sa->sa_family) { case AF_INET: - ret = ntohs(((struct sockaddr_in *)sa)->sin_port); - break; + return ntohs(sin->sin_port); case AF_INET6: - ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); - break; + return ntohs(sin6->sin6_port); } - return ret; + + return 0; } static inline size_t svc_addr_len(struct sockaddr *sa) @@ -124,36 +127,39 @@ static inline size_t svc_addr_len(struct sockaddr *sa) return -EAFNOSUPPORT; } -static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt) +static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt) { - return svc_addr_port((struct sockaddr *)&xprt->xpt_local); + return svc_addr_port((const struct sockaddr *)&xprt->xpt_local); } -static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) +static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt) { - return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); + return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote); } -static inline char *__svc_print_addr(struct sockaddr *addr, - char *buf, size_t len) +static inline char *__svc_print_addr(const struct sockaddr *addr, + char *buf, const size_t len) { + const struct sockaddr_in *sin = (const struct sockaddr_in *)addr; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr; + switch (addr->sa_family) { case AF_INET: - snprintf(buf, len, "%pI4, port=%u", - &((struct sockaddr_in *)addr)->sin_addr, - ntohs(((struct sockaddr_in *) addr)->sin_port)); + snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr, + ntohs(sin->sin_port)); break; case AF_INET6: snprintf(buf, len, "%pI6, port=%u", - &((struct sockaddr_in6 *)addr)->sin6_addr, - ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); + &sin6->sin6_addr, + ntohs(sin6->sin6_port)); break; default: snprintf(buf, len, "unknown address type: %d", addr->sa_family); break; } + return buf; } #endif /* SUNRPC_SVC_XPRT_H */ -- cgit v1.2.3-59-g8ed1b From e9d9df44736d116726f4596f7e2f9ce2764ffc0a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 18 Mar 2009 16:42:57 +0800 Subject: ftrace: protect running nmi (V3) When I review the sensitive code ftrace_nmi_enter(), I found the atomic variable nmi_running does protect NMI VS do_ftrace_mod_code(), but it can not protects NMI(entered nmi) VS NMI(ftrace_nmi_enter()). cpu#1 | cpu#2 | cpu#3 ftrace_nmi_enter() | do_ftrace_mod_code() | not modify | | ------------------------|-----------------------|-- executing | set mod_code_write = 1| executing --|-----------------------|-------------------- executing | | ftrace_nmi_enter() executing | | do modify ------------------------|-----------------------|----------------- ftrace_nmi_exit() | | cpu#3 may be being modified the code which is still being executed on cpu#1, it will have undefined results and possibly take a GPF, this patch prevents it occurred. Signed-off-by: Lai Jiangshan LKML-Reference: <49C0B411.30003@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 63 ++++++++++++++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1d0d7f42efe3..57b33edb7ce3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -79,11 +79,11 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * * 1) Put the instruction pointer into the IP buffer * and the new code into the "code" buffer. - * 2) Set a flag that says we are modifying code - * 3) Wait for any running NMIs to finish. - * 4) Write the code - * 5) clear the flag. - * 6) Wait for any running NMIs to finish. + * 2) Wait for any running NMIs to finish and set a flag that says + * we are modifying code, it is done in an atomic operation. + * 3) Write the code + * 4) clear the flag. + * 5) Wait for any running NMIs to finish. * * If an NMI is executed, the first thing it does is to call * "ftrace_nmi_enter". This will check if the flag is set to write @@ -95,9 +95,9 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * are the same as what exists. */ +#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */ static atomic_t nmi_running = ATOMIC_INIT(0); static int mod_code_status; /* holds return value of text write */ -static int mod_code_write; /* set when NMI should do the write */ static void *mod_code_ip; /* holds the IP to write to */ static void *mod_code_newcode; /* holds the text to write to the IP */ @@ -114,6 +114,20 @@ int ftrace_arch_read_dyn_info(char *buf, int size) return r; } +static void clear_mod_flag(void) +{ + int old = atomic_read(&nmi_running); + + for (;;) { + int new = old & ~MOD_CODE_WRITE_FLAG; + + if (old == new) + break; + + old = atomic_cmpxchg(&nmi_running, old, new); + } +} + static void ftrace_mod_code(void) { /* @@ -127,27 +141,39 @@ static void ftrace_mod_code(void) /* if we fail, then kill any new writers */ if (mod_code_status) - mod_code_write = 0; + clear_mod_flag(); } void ftrace_nmi_enter(void) { - atomic_inc(&nmi_running); - /* Must have nmi_running seen before reading write flag */ - smp_mb(); - if (mod_code_write) { + if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { + smp_rmb(); ftrace_mod_code(); atomic_inc(&nmi_update_count); } + /* Must have previous changes seen before executions */ + smp_mb(); } void ftrace_nmi_exit(void) { /* Finish all executions before clearing nmi_running */ - smp_wmb(); + smp_mb(); atomic_dec(&nmi_running); } +static void wait_for_nmi_and_set_mod_flag(void) +{ + if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)) + return; + + do { + cpu_relax(); + } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG)); + + nmi_wait_count++; +} + static void wait_for_nmi(void) { if (!atomic_read(&nmi_running)) @@ -167,14 +193,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) mod_code_newcode = new_code; /* The buffers need to be visible before we let NMIs write them */ - smp_wmb(); - - mod_code_write = 1; - - /* Make sure write bit is visible before we wait on NMIs */ smp_mb(); - wait_for_nmi(); + wait_for_nmi_and_set_mod_flag(); /* Make sure all running NMIs have finished before we write the code */ smp_mb(); @@ -182,13 +203,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) ftrace_mod_code(); /* Make sure the write happens before clearing the bit */ - smp_wmb(); - - mod_code_write = 0; - - /* make sure NMIs see the cleared bit */ smp_mb(); + clear_mod_flag(); wait_for_nmi(); return mod_code_status; -- cgit v1.2.3-59-g8ed1b From df7c8e845e8e2030e8ae947e0ace56d184d0e9a0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Mar 2009 15:22:20 +1030 Subject: cpumask: remove cpumask allocation from idle_balance Impact: fix circular locking Steven reports a circular locking from alloc_cpumask_var doing a wakeup. We get rid of this using the tried-and-true technique of using a per-cpu cpumask_var_t rather than doing an alloc every time. Simpler and more robust than a rare, implicit allocation within an atomic codepath. Reported-by: Steven Rostedt Signed-off-by: Rusty Russell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5dabd80c3c15..48862d418bed 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3448,19 +3448,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ #define MAX_PINNED_INTERVAL 512 +/* Working cpumask for load_balance and load_balance_newidle. */ +static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance, struct cpumask *cpus) + int *balance) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_setall(cpus); @@ -3615,8 +3619,7 @@ out: * this_rq is locked. */ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, - struct cpumask *cpus) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) { struct sched_group *group; struct rq *busiest = NULL; @@ -3624,6 +3627,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); cpumask_setall(cpus); @@ -3764,10 +3768,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; - cpumask_var_t tmpmask; - - if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC)) - return; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3778,7 +3778,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, this_rq, - sd, tmpmask); + sd); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3793,7 +3793,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ this_rq->next_balance = next_balance; } - free_cpumask_var(tmpmask); } /* @@ -3943,11 +3942,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; int need_serialize; - cpumask_var_t tmp; - - /* Fails alloc? Rebalancing probably not a priority right now. */ - if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) - return; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3972,7 +3966,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -4006,8 +4000,6 @@ out: */ if (likely(update_next_balance)) rq->next_balance = next_balance; - - free_cpumask_var(tmp); } /* @@ -8303,6 +8295,9 @@ void __init sched_init(void) #endif #ifdef CONFIG_USER_SCHED alloc_size *= 2; +#endif +#ifdef CONFIG_CPUMASK_OFFSTACK + alloc_size *= num_possible_cpus() * cpumask_size(); #endif /* * As sched_init() is called before page_alloc is setup, @@ -8341,6 +8336,12 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_CPUMASK_OFFSTACK + for_each_possible_cpu(i) { + per_cpu(load_balance_tmpmask, i) = (void *)ptr; + ptr += cpumask_size(); + } +#endif /* CONFIG_CPUMASK_OFFSTACK */ } #ifdef CONFIG_SMP -- cgit v1.2.3-59-g8ed1b From 17ad6ea621b1c7952ebd7330ce65de26b6ee9cca Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 19 Mar 2009 22:03:22 +1100 Subject: numa, cpumask: move numa_node_id default implementation to topology.h, fix Impact: build fix for powerpc and sparc Today's linux-next build (powerpc allyesconfig) failed like this: > In file included from include/linux/mmzone.h:776, > from include/linux/gfp.h:5, > from include/linux/kmod.h:23, > from include/linux/module.h:14, > from init/version.c:11: > arch/powerpc/include/asm/mmzone.h:32: error: expected '=', ',', ';', 'asm' or '__attribute__' before 'numa_cpumask_lookup_table' Caused by commit 082edb7bf443eb8eda15b482d16ad9dd8137ad24 ("numa, cpumask: move numa_node_id default implementation to topology.h") from the cpus4096 tree which removed the include of linux/topology.h from linux/mmzone.h. Same for sparc64 defconfig. Signed-off-by: Stephen Rothwell Acked-b: Rusty Russell Cc: ppc-dev LKML-Reference: <20090319220322.3baa4613.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/mmzone.h | 1 + arch/sparc/include/asm/mmzone.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index 19f299b7e256..35acac90c8ca 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -8,6 +8,7 @@ #define _ASM_MMZONE_H_ #ifdef __KERNEL__ +#include /* * generic non-linear memory support: diff --git a/arch/sparc/include/asm/mmzone.h b/arch/sparc/include/asm/mmzone.h index ebf5986c12ed..e8c648741ed4 100644 --- a/arch/sparc/include/asm/mmzone.h +++ b/arch/sparc/include/asm/mmzone.h @@ -3,6 +3,8 @@ #ifdef CONFIG_NEED_MULTIPLE_NODES +#include + extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) -- cgit v1.2.3-59-g8ed1b From 4a44bac1f98223ed77e47bf3b42fcfd10cddd85f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 19 Mar 2009 13:21:44 +0100 Subject: symbols, stacktrace: look up init symbols after module symbols Impact: fix incomplete stacktraces I noticed such weird stacktrace entries in lockdep dumps: [ 0.285956] {HARDIRQ-ON-W} state was registered at: [ 0.285956] [] mark_irqflags+0xbe/0x125 [ 0.285956] [] __lock_acquire+0x674/0x82d [ 0.285956] [] lock_acquire+0xfc/0x128 [ 0.285956] [] rt_spin_lock+0xc8/0xd0 [ 0.285956] [] 0xffffffffffffffff The stacktrace entry is cut off after rt_spin_lock. After much debugging i found out that stacktrace entries that belong to init symbols dont get printed out, due to commit: a2da405: module: Don't report discarded init pages as kernel text. The reason is this check added to core_kernel_text(): - if (addr >= (unsigned long)_sinittext && + if (system_state == SYSTEM_BOOTING && + addr >= (unsigned long)_sinittext && addr <= (unsigned long)_einittext) return 1; This will discard inittext symbols even though their symbol table is still present and even though stacktraces done while the system was booting up might still be relevant. To not reintroduce the (not well-specified) bug addressed in that commit, first do a module symbols lookup, then a final init-symbols lookup. This will work fine on architectures that have separate address spaces for modules (such as x86) - and should not crash any other architectures either. Acked-by: Peter Zijlstra Cc: Rusty Russell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/extable.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/kernel/extable.c b/kernel/extable.c index e136ed8d82ba..c46da6a47036 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,6 +41,14 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } +static inline int init_kernel_text(unsigned long addr) +{ + if (addr >= (unsigned long)_sinittext && + addr <= (unsigned long)_einittext) + return 1; + return 0; +} + __notrace_funcgraph int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && @@ -48,8 +56,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr) return 1; if (system_state == SYSTEM_BOOTING && - addr >= (unsigned long)_sinittext && - addr <= (unsigned long)_einittext) + init_kernel_text(addr)) return 1; return 0; } @@ -58,7 +65,19 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return __module_text_address(addr) != NULL; + if (__module_text_address(addr)) + return 1; + /* + * There might be init symbols in saved stacktraces. + * Give those symbols a chance to be printed in + * backtraces (such as lockdep traces). + * + * Since we are after the module-symbols check, there's + * no danger of address overlap: + */ + if (init_kernel_text(addr)) + return 1; + return 0; } int kernel_text_address(unsigned long addr) -- cgit v1.2.3-59-g8ed1b From 8c083f081d0014057901c68a0a3e0f8ca7ac8d23 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 19 Mar 2009 15:22:20 +1030 Subject: cpumask: remove cpumask allocation from idle_balance, fix Impact: fix boot crash Fix typo in the size calculation. Reported-by: Ingo Molnar Signed-off-by: Rusty Russell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 48862d418bed..11dd52780adb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8297,7 +8297,7 @@ void __init sched_init(void) alloc_size *= 2; #endif #ifdef CONFIG_CPUMASK_OFFSTACK - alloc_size *= num_possible_cpus() * cpumask_size(); + alloc_size += num_possible_cpus() * cpumask_size(); #endif /* * As sched_init() is called before page_alloc is setup, -- cgit v1.2.3-59-g8ed1b From ac5f6c96859e9a664ac05b04bc96ed1caad5fe29 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2009 11:29:23 -0400 Subject: function-graph: consolidate prologues for output Impact: clean up The prologue of the function graph entry, return and comments all start out pretty much the same. Each of these duplicate code and do so slightly differently. This patch consolidates the printing of the pid, absolute time, cpu and proc (and for entry, the interrupt). Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 98 +++++++++++------------------------- 1 file changed, 29 insertions(+), 69 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6004ccac2dd7..2d4d185cd0d8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -554,24 +554,24 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, } static enum print_line_t -print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, - struct trace_iterator *iter) +print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, + int type, unsigned long addr) { - int ret; - int cpu = iter->cpu; - pid_t *last_entry = iter->private; struct trace_entry *ent = iter->ent; - struct ftrace_graph_ent *call = &field->graph_ent; - struct ftrace_graph_ret_entry *leaf_ret; + pid_t *last_pid = iter->private; + int cpu = iter->cpu; + int ret; /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; - /* Interrupt */ - ret = print_graph_irq(iter, call->func, TRACE_GRAPH_ENT, cpu, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + if (type) { + /* Interrupt */ + ret = print_graph_irq(iter, addr, type, cpu, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } /* Absolute time */ if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { @@ -598,6 +598,20 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } + return 0; +} + +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter) +{ + int cpu = iter->cpu; + struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ret_entry *leaf_ret; + + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) + return TRACE_TYPE_PARTIAL_LINE; + leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) return print_graph_entry_leaf(iter, field, leaf_ret, s); @@ -613,38 +627,12 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int i; int ret; int cpu = iter->cpu; - pid_t *last_pid = iter->private, pid = ent->pid; + pid_t pid = ent->pid; unsigned long long duration = trace->rettime - trace->calltime; - /* Pid */ - if (verif_pid(s, pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) + if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; - /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - /* Overhead */ ret = print_graph_overhead(duration, s); if (!ret) @@ -689,38 +677,10 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, { int i; int ret; - int cpu = iter->cpu; - pid_t *last_pid = iter->private; - /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) + if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; - /* Absolute time */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Cpu */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } - - /* Proc */ - if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_printf(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - /* No overhead */ ret = print_graph_overhead(-1, s); if (!ret) -- cgit v1.2.3-59-g8ed1b From 3bf832ce1fe6988148d392599f34ca0c6a34427d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 19 Mar 2009 14:47:33 +0100 Subject: tracing/ring-buffer: fix non cpu hotplug case Impact: fix warning with irqsoff tracer The ring buffer allocates its buffers on pre-smp time (early_initcall). It means that, at first, only the boot cpu buffer is allocated and the ring-buffer cpumask only has the boot cpu set (cpu_online_mask). Later, the secondary cpu will show up and the ring-buffer will be notified about this event: the appropriate buffer will be allocated and the cpumask will be updated. Unfortunately, if !CONFIG_CPU_HOTPLUG, the ring-buffer will not be notified about the secondary cpus, meaning that the cpumask will have only the cpu boot set, and only one cpu buffer allocated. We fix that by using cpu_possible_mask if !CONFIG_CPU_HOTPLUG. This patch fixes the following warning with irqsoff tracer running: [ 169.317794] WARNING: at kernel/trace/trace.c:466 update_max_tr_single+0xcc/0xf3() [ 169.318002] Hardware name: AMILO Li 2727 [ 169.318002] Modules linked in: [ 169.318002] Pid: 5624, comm: bash Not tainted 2.6.29-rc8-tip-02636-g6aafa6c #11 [ 169.318002] Call Trace: [ 169.318002] [] warn_slowpath+0xea/0x13d [ 169.318002] [] ? ftrace_call+0x5/0x2b [ 169.318002] [] ? ftrace_call+0x5/0x2b [ 169.318002] [] ? ftrace_call+0x0/0x2b [ 169.318002] [] ? ftrace_modify_code+0xa9/0x108 [ 169.318002] [] ? trace_hardirqs_off+0x25/0x27 [ 169.318002] [] ? _spin_unlock_irqrestore+0x1f/0x2d [ 169.318002] [] ? ring_buffer_reset_cpu+0xf6/0xfb [ 169.318002] [] ? ring_buffer_reset+0x36/0x48 [ 169.318002] [] update_max_tr_single+0xcc/0xf3 [ 169.318002] [] ? sysret_check+0x22/0x5d [ 169.318002] [] stop_critical_timing+0x142/0x204 [ 169.318002] [] trace_hardirqs_on_caller+0x23/0x25 [ 169.318002] [] trace_hardirqs_on_thunk+0x3a/0x3c [ 169.318002] [] ? sysret_check+0x22/0x5d [ 169.318002] ---[ end trace db76cbf775a750cf ]--- Because this tracer may try to swap two cpu ring buffers for an unregistered cpu on the ring buffer. This patch might also fix a fair loss of traces due to unallocated buffers for secondary cpus. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Acked-b: Steven Rostedt LKML-Reference: <1237470453-5427-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bbf51922a8ca..384ca5d9d729 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -577,8 +577,17 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) if (buffer->pages == 1) buffer->pages++; + /* + * In case of non-hotplug cpu, if the ring-buffer is allocated + * in early initcall, it will not be notified of secondary cpus. + * In that off case, we need to allocate for all possible cpus. + */ +#ifdef CONFIG_HOTPLUG_CPU get_online_cpus(); cpumask_copy(buffer->cpumask, cpu_online_mask); +#else + cpumask_copy(buffer->cpumask, cpu_possible_mask); +#endif buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; -- cgit v1.2.3-59-g8ed1b From 5ef841f6f32dce0b752a4fa0622781ee67a0e874 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2009 12:20:38 -0400 Subject: tracing: make print_(b)printk_msg_only global This patch makes print_printk_msg_only and print_bprintk_msg_only global for other functions to use. It also renames them by adding a "trace_" to the beginning to avoid namespace collisions. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 36 ++---------------------------------- kernel/trace/trace_output.c | 32 ++++++++++++++++++++++++++++++++ kernel/trace/trace_output.h | 5 +++++ 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d981ababc45..c637cb687cf2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1694,38 +1694,6 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return TRACE_TYPE_HANDLED; } -static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct bprint_entry *field; - int ret; - - trace_assign_type(field, entry); - - ret = trace_seq_bprintf(s, field->fmt, field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t print_printk_msg_only(struct trace_iterator *iter) -{ - struct trace_seq *s = &iter->seq; - struct trace_entry *entry = iter->ent; - struct print_entry *field; - int ret; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%s", field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1787,12 +1755,12 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter) if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) - return print_bprintk_msg_only(iter); + return trace_print_bprintk_msg_only(iter); if (iter->ent->type == TRACE_PRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) - return print_printk_msg_only(iter); + return trace_print_printk_msg_only(iter); if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 6a4c9dea191e..b45141748af5 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -19,6 +19,38 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; +enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_bprintf(s, field->fmt, field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, "%s", field->buf); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + /** * trace_seq_printf - sequence printing of trace information * @s: trace sequence descriptor diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 3b90e6ade1aa..35c422fb51a9 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -15,6 +15,11 @@ struct trace_event { trace_print_func binary; }; +extern enum print_line_t +trace_print_bprintk_msg_only(struct trace_iterator *iter); +extern enum print_line_t +trace_print_printk_msg_only(struct trace_iterator *iter); + extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int -- cgit v1.2.3-59-g8ed1b From 2fbcdb35aca614f9529a0e7d340146cf0b71684f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2009 13:24:42 -0400 Subject: function-graph: calculate function depth within function graph tracer Currently, the function graph tracer depends on the trace_printk to record the depth. All the information is already there in the trace to calculate function depth, with the exception of having the printk be the first item. But as soon as a entry or exit is reached, then we know the depth. This patch changes the iter->private data from recording a per cpu last_pid, to a structure that holds both the last_pid and the current depth. This data is used to determine the function depth for the printks. Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 91 +++++++++++++++++++++++++++--------- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2d4d185cd0d8..66ea23b64fe6 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -14,6 +14,11 @@ #include "trace.h" #include "trace_output.h" +struct fgraph_data { + pid_t last_pid; + int depth; +}; + #define TRACE_GRAPH_INDENT 2 /* Flag options */ @@ -231,16 +236,16 @@ print_graph_proc(struct trace_seq *s, pid_t pid) /* If the pid changed since the last trace, output this event */ static enum print_line_t -verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu) +verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) { pid_t prev_pid; pid_t *last_pid; int ret; - if (!last_pids_cpu) + if (!data) return TRACE_TYPE_HANDLED; - last_pid = per_cpu_ptr(last_pids_cpu, cpu); + last_pid = &(per_cpu_ptr(data, cpu)->last_pid); if (*last_pid == pid) return TRACE_TYPE_HANDLED; @@ -471,6 +476,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) { + struct fgraph_data *data = iter->private; struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; @@ -481,6 +487,18 @@ print_graph_entry_leaf(struct trace_iterator *iter, call = &entry->graph_ent; duration = graph_ret->rettime - graph_ret->calltime; + if (data) { + int cpu = iter->cpu; + int *depth = &(per_cpu_ptr(data, cpu)->depth); + + /* + * Comments display at + 1 to depth. Since + * this is a leaf function, keep the comments + * equal to this depth. + */ + *depth = call->depth - 1; + } + /* Overhead */ ret = print_graph_overhead(duration, s); if (!ret) @@ -512,12 +530,21 @@ print_graph_entry_leaf(struct trace_iterator *iter, } static enum print_line_t -print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, - struct trace_seq *s, pid_t pid, int cpu) +print_graph_entry_nested(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, + struct trace_seq *s, int cpu) { - int i; - int ret; struct ftrace_graph_ent *call = &entry->graph_ent; + struct fgraph_data *data = iter->private; + int ret; + int i; + + if (data) { + int cpu = iter->cpu; + int *depth = &(per_cpu_ptr(data, cpu)->depth); + + *depth = call->depth; + } /* No overhead */ ret = print_graph_overhead(-1, s); @@ -557,13 +584,13 @@ static enum print_line_t print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, int type, unsigned long addr) { + struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; - pid_t *last_pid = iter->private; int cpu = iter->cpu; int ret; /* Pid */ - if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE) + if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) return TRACE_TYPE_PARTIAL_LINE; if (type) { @@ -616,7 +643,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, if (leaf_ret) return print_graph_entry_leaf(iter, field, leaf_ret, s); else - return print_graph_entry_nested(field, s, iter->ent->pid, cpu); + return print_graph_entry_nested(iter, field, s, cpu); } @@ -624,11 +651,24 @@ static enum print_line_t print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter) { - int i; - int ret; - int cpu = iter->cpu; - pid_t pid = ent->pid; unsigned long long duration = trace->rettime - trace->calltime; + struct fgraph_data *data = iter->private; + pid_t pid = ent->pid; + int cpu = iter->cpu; + int ret; + int i; + + if (data) { + int cpu = iter->cpu; + int *depth = &(per_cpu_ptr(data, cpu)->depth); + + /* + * Comments display at + 1 to depth. This is the + * return from a function, we now want the comments + * to display at the same level of the bracket. + */ + *depth = trace->depth - 1; + } if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; @@ -675,8 +715,13 @@ static enum print_line_t print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter) { - int i; + struct fgraph_data *data = iter->private; + int depth = 0; int ret; + int i; + + if (data) + depth = per_cpu_ptr(data, iter->cpu)->depth; if (print_graph_prologue(iter, s, 0, 0)) return TRACE_TYPE_PARTIAL_LINE; @@ -694,8 +739,8 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, } /* Indentation */ - if (trace->depth > 0) - for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { + if (depth > 0) + for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { ret = trace_seq_printf(s, " "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -780,19 +825,21 @@ static void print_graph_headers(struct seq_file *s) static void graph_trace_open(struct trace_iterator *iter) { - /* pid on the last trace processed */ - pid_t *last_pid = alloc_percpu(pid_t); + /* pid and depth on the last trace processed */ + struct fgraph_data *data = alloc_percpu(struct fgraph_data); int cpu; - if (!last_pid) + if (!data) pr_warning("function graph tracer: not enough memory\n"); else for_each_possible_cpu(cpu) { - pid_t *pid = per_cpu_ptr(last_pid, cpu); + pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data, cpu)->depth); *pid = -1; + *depth = 0; } - iter->private = last_pid; + iter->private = data; } static void graph_trace_close(struct trace_iterator *iter) -- cgit v1.2.3-59-g8ed1b From 40ce74f19c28077550646c76d96a075bf312e461 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2009 14:03:53 -0400 Subject: tracing: remove recording function depth from trace_printk The function depth in trace_printk was to facilitate the function graph output. Now that the function graph calculates the depth within the trace output, we no longer need to record the depth when the trace_printk is called. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++----- kernel/trace/trace.h | 6 ++---- kernel/trace/trace_event_types.h | 2 -- kernel/trace/trace_mmiotrace.c | 2 +- kernel/trace/trace_printk.c | 8 ++++---- 5 files changed, 10 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c637cb687cf2..f7f359d45823 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1194,7 +1194,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) * trace_vbprintk - write binary msg to tracing buffer * */ -int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; @@ -1236,7 +1236,6 @@ int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; entry = ring_buffer_event_data(event); entry->ip = ip; - entry->depth = depth; entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); @@ -1254,7 +1253,7 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; @@ -1291,7 +1290,6 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args) goto out_unlock; entry = ring_buffer_event_data(event); entry->ip = ip; - entry->depth = depth; memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; @@ -3140,7 +3138,7 @@ static int mark_printk(const char *fmt, ...) int ret; va_list args; va_start(args, fmt); - ret = trace_vprintk(0, -1, fmt, args); + ret = trace_vprintk(0, fmt, args); va_end(args); return ret; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 38276d1638e3..7c9a0cbf5dca 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -123,7 +123,6 @@ struct userstack_entry { struct bprint_entry { struct trace_entry ent; unsigned long ip; - int depth; const char *fmt; u32 buf[]; }; @@ -131,7 +130,6 @@ struct bprint_entry { struct print_entry { struct trace_entry ent; unsigned long ip; - int depth; char buf[]; }; @@ -598,9 +596,9 @@ extern int trace_selftest_startup_branch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); extern long ns2usecs(cycle_t nsec); extern int -trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args); +trace_vbprintk(unsigned long ip, const char *fmt, va_list args); extern int -trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); +trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 019915063fe6..fd78bee71dd7 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -105,7 +105,6 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned int, depth, depth) TRACE_FIELD(char *, fmt, fmt) TRACE_FIELD_ZERO_CHAR(buf) ), @@ -115,7 +114,6 @@ TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, ip, ip) - TRACE_FIELD(unsigned int, depth, depth) TRACE_FIELD_ZERO_CHAR(buf) ), TP_RAW_FMT("%08lx (%d) fmt:%p %s") diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index f095916e477f..8e37fcddd8b4 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -359,5 +359,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map) int mmio_trace_printk(const char *fmt, va_list args) { - return trace_vprintk(0, -1, fmt, args); + return trace_vprintk(0, fmt, args); } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 486785214e3e..eb81556107fe 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -112,7 +112,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); + ret = trace_vbprintk(ip, fmt, ap); va_end(ap); return ret; } @@ -126,7 +126,7 @@ int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; - return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap); + return trace_vbprintk(ip, fmt, ap); } EXPORT_SYMBOL_GPL(__ftrace_vbprintk); @@ -139,7 +139,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...) return 0; va_start(ap, fmt); - ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + ret = trace_vprintk(ip, fmt, ap); va_end(ap); return ret; } @@ -150,7 +150,7 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) if (!(trace_flags & TRACE_ITER_PRINTK)) return 0; - return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap); + return trace_vprintk(ip, fmt, ap); } EXPORT_SYMBOL_GPL(__ftrace_vprintk); -- cgit v1.2.3-59-g8ed1b From 5087f8d2a2f2daff5a913d72d8ea3ad601948e10 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 19 Mar 2009 15:14:46 -0400 Subject: function-graph: show binary events as comments With the added TRACE_EVENT macro, the events no longer appear in the function graph tracer. This was because the function graph did not know how to display the entries. The graph tracer was only aware of its own entries and the printk entries. By using the event call back feature, the graph tracer can now display the events. # echo irq > /debug/tracing/set_event Which can show: 0) | handle_IRQ_event() { 0) | /* irq_handler_entry: irq=48 handler=eth0 */ 0) | e1000_intr() { 0) 0.926 us | __napi_schedule(); 0) 3.888 us | } 0) | /* irq_handler_exit: irq=48 return=handled */ 0) 0.655 us | runqueue_is_locked(); 0) | __wake_up() { 0) 0.831 us | _spin_lock_irqsave(); The irq entry and exit events show up as comments. Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 40 +++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 66ea23b64fe6..e876816fa8e7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -712,10 +712,12 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } static enum print_line_t -print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, - struct trace_entry *ent, struct trace_iterator *iter) +print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + struct trace_iterator *iter) { + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct fgraph_data *data = iter->private; + struct trace_event *event; int depth = 0; int ret; int i; @@ -751,9 +753,26 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_bprintf(s, trace->fmt, trace->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + switch (iter->ent->type) { + case TRACE_BPRINT: + ret = trace_print_bprintk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + case TRACE_PRINT: + ret = trace_print_printk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + default: + event = ftrace_find_event(ent->type); + if (!event) + return TRACE_TYPE_UNHANDLED; + + ret = event->trace(iter, sym_flags); + if (ret != TRACE_TYPE_HANDLED) + return ret; + } /* Strip ending newline */ if (s->buffer[s->len - 1] == '\n') { @@ -772,8 +791,8 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s, enum print_line_t print_graph_function(struct trace_iterator *iter) { - struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; switch (entry->type) { case TRACE_GRAPH_ENT: { @@ -786,14 +805,11 @@ print_graph_function(struct trace_iterator *iter) trace_assign_type(field, entry); return print_graph_return(&field->ret, s, entry, iter); } - case TRACE_BPRINT: { - struct bprint_entry *field; - trace_assign_type(field, entry); - return print_graph_comment(field, s, entry, iter); - } default: - return TRACE_TYPE_UNHANDLED; + return print_graph_comment(s, entry, iter); } + + return TRACE_TYPE_HANDLED; } static void print_graph_headers(struct seq_file *s) -- cgit v1.2.3-59-g8ed1b From 23725aeeab10ba02bcf10ec49ad73146b54cb52f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:13 +0100 Subject: ftrace: provide an id file for each event Since not every event has a format file to read the id from, expose it explicitly in a separate file. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt LKML-Reference: <20090319194233.372534033@chello.nl> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c88227b3b9db..7763db8fd0b3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -412,6 +412,29 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, return r; } +static ssize_t +event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + trace_seq_printf(s, "%d\n", call->id); + + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, s->len); + kfree(s); + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -452,6 +475,11 @@ static const struct file_operations ftrace_event_format_fops = { .read = event_format_read, }; +static const struct file_operations ftrace_event_id_fops = { + .open = tracing_open_generic, + .read = event_id_read, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -550,6 +578,14 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) "'%s/enable' entry\n", call->name); } + if (call->id) { + entry = debugfs_create_file("id", 0444, call->dir, call, + &ftrace_event_id_fops); + if (!entry) + pr_warning("Could not create debugfs '%s/id' entry\n", + call->name); + } + /* A trace may not want to export its format */ if (!call->show_format) return 0; -- cgit v1.2.3-59-g8ed1b From 28bea271e58e429eccfad3d7ee2ad12d6ee015bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:14 +0100 Subject: ftrace: ensure every event gets an id Impact: widen user-space visibe event IDs to all events Previously only TRACE_EVENT events got ids, because only they generated raw output which needs to be demuxed from the trace. In order to provide a unique ID for each event, register everybody, regardless. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt LKML-Reference: <20090319194233.464914218@chello.nl> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_stage_3.h | 15 ++++++++++++++- kernel/trace/trace_output.c | 5 +++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index ae2e323df0c7..4c26d97b4508 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -130,7 +130,19 @@ static void ftrace_unreg_event_##call(void) \ { \ unregister_trace_##call(ftrace_event_##call); \ } \ - + \ +static struct ftrace_event_call event_##call; \ + \ +static int ftrace_init_event_##call(void) \ +{ \ + int id; \ + \ + id = register_ftrace_event(NULL); \ + if (!id) \ + return -ENODEV; \ + event_##call.id = id; \ + return 0; \ +} #undef TRACE_FORMAT #define TRACE_FORMAT(call, proto, args, fmt) \ @@ -140,6 +152,7 @@ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_init_event_##call, \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b45141748af5..19261fdd2455 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -481,6 +481,11 @@ int register_ftrace_event(struct trace_event *event) mutex_lock(&trace_event_mutex); + if (!event) { + ret = next_event_type++; + goto out; + } + if (!event->type) event->type = next_event_type++; else if (event->type > __TRACE_LAST_TYPE) { -- cgit v1.2.3-59-g8ed1b From ac199db0189c091f2863312061c0575937f68810 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:15 +0100 Subject: ftrace: event profile hooks Impact: new tracing infrastructure feature Provide infrastructure to generate software perf counter events from tracepoints. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt LKML-Reference: <20090319194233.557364871@chello.nl> Signed-off-by: Ingo Molnar --- kernel/trace/Makefile | 1 + kernel/trace/events.c | 1 - kernel/trace/trace.h | 11 ++++++++++ kernel/trace/trace_event_profile.c | 31 ++++++++++++++++++++++++++ kernel/trace/trace_events.c | 9 ++------ kernel/trace/trace_events_stage_3.h | 44 +++++++++++++++++++++++++++++++++++++ 6 files changed, 89 insertions(+), 8 deletions(-) create mode 100644 kernel/trace/trace_event_profile.c diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c3feea01c3e0..0e45c206c2f9 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -44,5 +44,6 @@ obj-$(CONFIG_EVENT_TRACER) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o +obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o libftrace-y := ftrace.o diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 9fc918da404f..246f2aa6dc46 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -12,4 +12,3 @@ #include "trace_events_stage_2.h" #include "trace_events_stage_3.h" -#include diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7c9a0cbf5dca..7cfb741be200 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -785,12 +785,23 @@ struct ftrace_event_call { int id; int (*raw_init)(void); int (*show_format)(struct trace_seq *s); + +#ifdef CONFIG_EVENT_PROFILE + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); +#endif }; void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; +#define for_each_event(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c new file mode 100644 index 000000000000..22cba9970776 --- /dev/null +++ b/kernel/trace/trace_event_profile.c @@ -0,0 +1,31 @@ +/* + * trace event based perf counter profiling + * + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra + * + */ + +#include "trace.h" + +int ftrace_profile_enable(int event_id) +{ + struct ftrace_event_call *event; + + for_each_event(event) { + if (event->id == event_id) + return event->profile_enable(event); + } + + return -EINVAL; +} + +void ftrace_profile_disable(int event_id) +{ + struct ftrace_event_call *event; + + for_each_event(event) { + if (event->id == event_id) + return event->profile_disable(event); + } +} + diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7763db8fd0b3..3047b56f6637 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,11 +19,6 @@ static DEFINE_MUTEX(event_mutex); -#define events_for_each(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - static void ftrace_clear_events(void) { struct ftrace_event_call *call = (void *)__start_ftrace_events; @@ -90,7 +85,7 @@ static int ftrace_set_clr_event(char *buf, int set) } mutex_lock(&event_mutex); - events_for_each(call) { + for_each_event(call) { if (!call->name || !call->regfunc) continue; @@ -628,7 +623,7 @@ static __init int event_trace_init(void) if (!d_events) return 0; - events_for_each(call) { + for_each_event(call) { /* The linker may leave blanks */ if (!call->name) continue; diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 4c26d97b4508..6b3261ca988c 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -109,6 +109,40 @@ #undef TP_FMT #define TP_FMT(fmt, args...) fmt "\n", ##args +#ifdef CONFIG_EVENT_PROFILE +#define _TRACE_PROFILE(call, proto, args) \ +static void ftrace_profile_##call(proto) \ +{ \ + extern void perf_tpcounter_event(int); \ + perf_tpcounter_event(event_##call.id); \ +} \ + \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ +{ \ + int ret = 0; \ + \ + if (!atomic_inc_return(&call->profile_count)) \ + ret = register_trace_##call(ftrace_profile_##call); \ + \ + return ret; \ +} \ + \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ +{ \ + if (atomic_add_negative(-1, &call->profile_count)) \ + unregister_trace_##call(ftrace_profile_##call); \ +} + +#define _TRACE_PROFILE_INIT(call) \ + .profile_count = ATOMIC_INIT(-1), \ + .profile_enable = ftrace_profile_enable_##call, \ + .profile_disable = ftrace_profile_disable_##call, + +#else +#define _TRACE_PROFILE(call, proto, args) +#define _TRACE_PROFILE_INIT(call) +#endif + #define _TRACE_FORMAT(call, proto, args, fmt) \ static void ftrace_event_##call(proto) \ { \ @@ -147,6 +181,7 @@ static int ftrace_init_event_##call(void) \ #undef TRACE_FORMAT #define TRACE_FORMAT(call, proto, args, fmt) \ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ +_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ @@ -155,6 +190,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .raw_init = ftrace_init_event_##call, \ .regfunc = ftrace_reg_event_##call, \ .unregfunc = ftrace_unreg_event_##call, \ + _TRACE_PROFILE_INIT(call) \ } #undef __entry @@ -162,6 +198,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ \ static struct ftrace_event_call event_##call; \ \ @@ -227,4 +264,11 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ + _TRACE_PROFILE_INIT(call) \ } + +#include + +#undef _TRACE_PROFILE +#undef _TRACE_PROFILE_INIT + -- cgit v1.2.3-59-g8ed1b From 70f454408e68fdba2c2529ab7d6ec3c3525e59f2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 20 Mar 2009 09:24:10 +0100 Subject: s390: remove arch specific smp_send_stop() Impact: build fix on s390 !CONFIG_SMP Remove arch specific smp_send_stop for !CONFIG_SMP since it conflicts with a new generic version. Reported-by: Stephen Rothwell Signed-off-by: Heiko Carstens Cc: Martin Schwidefsky LKML-Reference: <20090320092410.30d2bac3@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- arch/s390/include/asm/smp.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 024b91e06239..f89e2d5c060c 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -97,12 +97,6 @@ extern void arch_send_call_function_ipi(cpumask_t mask); #endif #ifndef CONFIG_SMP -static inline void smp_send_stop(void) -{ - /* Disable all interrupts/machine checks */ - __load_psw_mask(psw_kernel_bits & ~PSW_MASK_MCHECK); -} - #define hard_smp_processor_id() 0 #define smp_cpu_not_running(cpu) 1 #endif -- cgit v1.2.3-59-g8ed1b From 505f2b970b2269ce4cb669b3ff4f6479d379cec2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 20 Mar 2009 11:05:04 +0100 Subject: tracing, Text Edit Lock - kprobes architecture independent support, nommu fix Impact: build fix on SH !CONFIG_MMU Stephen Rothwell reported this linux-next build failure on the SH architecture: kernel/built-in.o: In function `disable_all_kprobes': kernel/kprobes.c:1382: undefined reference to `text_mutex' [...] And observed: | Introduced by commit 4460fdad85becd569f11501ad5b91814814335ff ("tracing, | Text Edit Lock - kprobes architecture independent support") from the | tracing tree. text_mutex is defined in mm/memory.c which is only built | if CONFIG_MMU is defined, which is not true for sh allmodconfig. Move this lock to kernel/extable.c (which is already home to various kernel text related routines), which file is always built-in. Reported-by: Stephen Rothwell Cc: Paul Mundt Cc: Mathieu Desnoyers LKML-Reference: <20090320110602.86351a91.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- kernel/extable.c | 14 ++++++++++++-- mm/memory.c | 8 -------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/kernel/extable.c b/kernel/extable.c index 0df6253730be..25d39b0c3a1b 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -15,11 +15,21 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include +#include #include -#include -#include + #include +#include + +/* + * mutex protecting text section modification (dynamic code patching). + * some users need to sleep (allocating memory...) while they hold this lock. + * + * NOT exported to modules - patching kernel text is a really delicate matter. + */ +DEFINE_MUTEX(text_mutex); extern struct exception_table_entry __start___ex_table[]; extern struct exception_table_entry __stop___ex_table[]; diff --git a/mm/memory.c b/mm/memory.c index 05fab3bc5b4b..dfc9e4ea4e8b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -101,14 +101,6 @@ int randomize_va_space __read_mostly = 2; #endif -/* - * mutex protecting text section modification (dynamic code patching). - * some users need to sleep (allocating memory...) while they hold this lock. - * - * NOT exported to modules - patching kernel text is a really delicate matter. - */ -DEFINE_MUTEX(text_mutex); - static int __init disable_randmaps(char *s) { randomize_va_space = 0; -- cgit v1.2.3-59-g8ed1b From 08d2503ecc2337275942c1f52822b7a464c0c0a3 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 4 Mar 2009 12:01:29 -0800 Subject: [MTD] [NAND] Blackfin NFC Driver: do not clobber DMAC1_PERIMUX Only set DMAC1_PERIMUX once we have requested and been granted the dma channel to prevent breaking other peripherals in the error case Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/nand/bf5xx_nand.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/mtd/nand/bf5xx_nand.c b/drivers/mtd/nand/bf5xx_nand.c index 9af2a2cc1153..5c6056e02426 100644 --- a/drivers/mtd/nand/bf5xx_nand.c +++ b/drivers/mtd/nand/bf5xx_nand.c @@ -552,7 +552,6 @@ static void bf5xx_nand_dma_write_buf(struct mtd_info *mtd, static int bf5xx_nand_dma_init(struct bf5xx_nand_info *info) { int ret; - unsigned short val; /* Do not use dma */ if (!hardware_ecc) @@ -560,13 +559,6 @@ static int bf5xx_nand_dma_init(struct bf5xx_nand_info *info) init_completion(&info->dma_completion); -#ifdef CONFIG_BF54x - /* Setup DMAC1 channel mux for NFC which shared with SDH */ - val = bfin_read_DMAC1_PERIMUX(); - val &= 0xFFFE; - bfin_write_DMAC1_PERIMUX(val); - SSYNC(); -#endif /* Request NFC DMA channel */ ret = request_dma(CH_NFC, "BF5XX NFC driver"); if (ret < 0) { @@ -574,6 +566,12 @@ static int bf5xx_nand_dma_init(struct bf5xx_nand_info *info) return ret; } +#ifdef CONFIG_BF54x + /* Setup DMAC1 channel mux for NFC which shared with SDH */ + bfin_write_DMAC1_PERIMUX(bfin_read_DMAC1_PERIMUX() & ~1); + SSYNC(); +#endif + set_dma_callback(CH_NFC, (void *) bf5xx_nand_dma_irq, (void *) info); /* Turn off the DMA channel first */ -- cgit v1.2.3-59-g8ed1b From 8d30cab06920f9efb7e089f2c69e451b2a637d8a Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 4 Mar 2009 12:01:29 -0800 Subject: [MTD] [NAND] Blackfin NFC Driver: mark bf5xx_nand_add_partition() as __devinit The bf5xx_nand_add_partition() func is only called by __devinit functions, so put it into the __devinit section as well Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/nand/bf5xx_nand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/bf5xx_nand.c b/drivers/mtd/nand/bf5xx_nand.c index 5c6056e02426..520314d4a805 100644 --- a/drivers/mtd/nand/bf5xx_nand.c +++ b/drivers/mtd/nand/bf5xx_nand.c @@ -630,7 +630,7 @@ static int bf5xx_nand_hw_init(struct bf5xx_nand_info *info) /* * Device management interface */ -static int bf5xx_nand_add_partition(struct bf5xx_nand_info *info) +static int __devinit bf5xx_nand_add_partition(struct bf5xx_nand_info *info) { struct mtd_info *mtd = &info->mtd; -- cgit v1.2.3-59-g8ed1b From bfc492571e7ad1eb4b9fe3ac8ab0a7cdaecf8eca Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 4 Mar 2009 12:01:30 -0800 Subject: [MTD] [NAND] Blackfin NFC Driver: drop pointless casts with set_dma_callback() Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/nand/bf5xx_nand.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/bf5xx_nand.c b/drivers/mtd/nand/bf5xx_nand.c index 520314d4a805..4c2a67ca801e 100644 --- a/drivers/mtd/nand/bf5xx_nand.c +++ b/drivers/mtd/nand/bf5xx_nand.c @@ -572,7 +572,7 @@ static int bf5xx_nand_dma_init(struct bf5xx_nand_info *info) SSYNC(); #endif - set_dma_callback(CH_NFC, (void *) bf5xx_nand_dma_irq, (void *) info); + set_dma_callback(CH_NFC, bf5xx_nand_dma_irq, info); /* Turn off the DMA channel first */ disable_dma(CH_NFC); -- cgit v1.2.3-59-g8ed1b From c6d59cdd412e1ae34ad9c8dc69eaabada792f7ae Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 4 Mar 2009 12:01:35 -0800 Subject: [JFFS2] kmem_cache_alloc/memset -> kmem_cache_zalloc Used kmem_cache_zalloc instead of kmem_cache_alloc/memset. Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- fs/jffs2/malloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index f9211252b5f1..9eff2bdae8a7 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x) struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) { struct jffs2_xattr_datum *xd; - xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL); + xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); dbg_memalloc("%p\n", xd); - memset(xd, 0, sizeof(struct jffs2_xattr_datum)); xd->class = RAWNODE_CLASS_XATTR_DATUM; xd->node = (void *)xd; INIT_LIST_HEAD(&xd->xindex); @@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd) struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) { struct jffs2_xattr_ref *ref; - ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL); + ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); dbg_memalloc("%p\n", ref); - memset(ref, 0, sizeof(struct jffs2_xattr_ref)); ref->class = RAWNODE_CLASS_XATTR_REF; ref->node = (void *)ref; return ref; -- cgit v1.2.3-59-g8ed1b From 52ff49df7fab18e56fa31b43143c4150c2547836 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 4 Mar 2009 12:01:36 -0800 Subject: [MTD] [NAND] fix "raw" reads with ECC syndrome layouts The syndrome based page read/write routines store ECC, and possibly other "OOB" data, right after each chunk of ECC'd data. With ECC chunk size of 512 bytes and a large page (2KiB) NAND, the layout is: data-0 OOB-0 data-1 OOB-1 data-2 OOB-2 data-3 OOB-3 OOB-leftover Where OOBx is (prepad, ECC, postpad). However, the current "raw" routines use a traditional layout -- data OOB, disregarding the prepad and postpad values -- so when they're used with that type of ECC hardware, those calls mix up the data and OOB. Which means, in particular, that bad block tables won't be found on startup, with data corruption and related chaos ensuing. The current syndrome-based drivers in mainline all seem to use one chunk per page; presumably they haven't noticed such bugs. Fix this, by adding read/write page_raw_syndrome() routines as siblings of the existing non-raw routines; "raw" just means to bypass the ECC computations, not change data and OOB layout. Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/nand/nand_base.c | 99 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0c3afccde8a2..d902370df3e0 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -748,6 +748,8 @@ static int nand_wait(struct mtd_info *mtd, struct nand_chip *chip) * @mtd: mtd info structure * @chip: nand chip info structure * @buf: buffer to store read data + * + * Not for syndrome calculating ecc controllers, which use a special oob layout */ static int nand_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip, uint8_t *buf) @@ -757,6 +759,47 @@ static int nand_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip, return 0; } +/** + * nand_read_page_raw_syndrome - [Intern] read raw page data without ecc + * @mtd: mtd info structure + * @chip: nand chip info structure + * @buf: buffer to store read data + * + * We need a special oob layout and handling even when OOB isn't used. + */ +static int nand_read_page_raw_syndrome(struct mtd_info *mtd, struct nand_chip *chip, + uint8_t *buf) +{ + int eccsize = chip->ecc.size; + int eccbytes = chip->ecc.bytes; + uint8_t *oob = chip->oob_poi; + int steps, size; + + for (steps = chip->ecc.steps; steps > 0; steps--) { + chip->read_buf(mtd, buf, eccsize); + buf += eccsize; + + if (chip->ecc.prepad) { + chip->read_buf(mtd, oob, chip->ecc.prepad); + oob += chip->ecc.prepad; + } + + chip->read_buf(mtd, oob, eccbytes); + oob += eccbytes; + + if (chip->ecc.postpad) { + chip->read_buf(mtd, oob, chip->ecc.postpad); + oob += chip->ecc.postpad; + } + } + + size = mtd->oobsize - (oob - chip->oob_poi); + if (size) + chip->read_buf(mtd, oob, size); + + return 0; +} + /** * nand_read_page_swecc - [REPLACABLE] software ecc based page read function * @mtd: mtd info structure @@ -1482,6 +1525,8 @@ static int nand_read_oob(struct mtd_info *mtd, loff_t from, * @mtd: mtd info structure * @chip: nand chip info structure * @buf: data buffer + * + * Not for syndrome calculating ecc controllers, which use a special oob layout */ static void nand_write_page_raw(struct mtd_info *mtd, struct nand_chip *chip, const uint8_t *buf) @@ -1490,6 +1535,44 @@ static void nand_write_page_raw(struct mtd_info *mtd, struct nand_chip *chip, chip->write_buf(mtd, chip->oob_poi, mtd->oobsize); } +/** + * nand_write_page_raw_syndrome - [Intern] raw page write function + * @mtd: mtd info structure + * @chip: nand chip info structure + * @buf: data buffer + * + * We need a special oob layout and handling even when ECC isn't checked. + */ +static void nand_write_page_raw_syndrome(struct mtd_info *mtd, struct nand_chip *chip, + const uint8_t *buf) +{ + int eccsize = chip->ecc.size; + int eccbytes = chip->ecc.bytes; + uint8_t *oob = chip->oob_poi; + int steps, size; + + for (steps = chip->ecc.steps; steps > 0; steps--) { + chip->write_buf(mtd, buf, eccsize); + buf += eccsize; + + if (chip->ecc.prepad) { + chip->write_buf(mtd, oob, chip->ecc.prepad); + oob += chip->ecc.prepad; + } + + chip->read_buf(mtd, oob, eccbytes); + oob += eccbytes; + + if (chip->ecc.postpad) { + chip->write_buf(mtd, oob, chip->ecc.postpad); + oob += chip->ecc.postpad; + } + } + + size = mtd->oobsize - (oob - chip->oob_poi); + if (size) + chip->write_buf(mtd, oob, size); +} /** * nand_write_page_swecc - [REPLACABLE] software ecc based page write function * @mtd: mtd info structure @@ -2569,10 +2652,6 @@ int nand_scan_tail(struct mtd_info *mtd) * check ECC mode, default to software if 3byte/512byte hardware ECC is * selected and we have 256 byte pagesize fallback to software ECC */ - if (!chip->ecc.read_page_raw) - chip->ecc.read_page_raw = nand_read_page_raw; - if (!chip->ecc.write_page_raw) - chip->ecc.write_page_raw = nand_write_page_raw; switch (chip->ecc.mode) { case NAND_ECC_HW: @@ -2581,6 +2660,10 @@ int nand_scan_tail(struct mtd_info *mtd) chip->ecc.read_page = nand_read_page_hwecc; if (!chip->ecc.write_page) chip->ecc.write_page = nand_write_page_hwecc; + if (!chip->ecc.read_page_raw) + chip->ecc.read_page_raw = nand_read_page_raw; + if (!chip->ecc.write_page_raw) + chip->ecc.write_page_raw = nand_write_page_raw; if (!chip->ecc.read_oob) chip->ecc.read_oob = nand_read_oob_std; if (!chip->ecc.write_oob) @@ -2602,6 +2685,10 @@ int nand_scan_tail(struct mtd_info *mtd) chip->ecc.read_page = nand_read_page_syndrome; if (!chip->ecc.write_page) chip->ecc.write_page = nand_write_page_syndrome; + if (!chip->ecc.read_page_raw) + chip->ecc.read_page_raw = nand_read_page_raw_syndrome; + if (!chip->ecc.write_page_raw) + chip->ecc.write_page_raw = nand_write_page_raw_syndrome; if (!chip->ecc.read_oob) chip->ecc.read_oob = nand_read_oob_syndrome; if (!chip->ecc.write_oob) @@ -2620,6 +2707,8 @@ int nand_scan_tail(struct mtd_info *mtd) chip->ecc.read_page = nand_read_page_swecc; chip->ecc.read_subpage = nand_read_subpage; chip->ecc.write_page = nand_write_page_swecc; + chip->ecc.read_page_raw = nand_read_page_raw; + chip->ecc.write_page_raw = nand_write_page_raw; chip->ecc.read_oob = nand_read_oob_std; chip->ecc.write_oob = nand_write_oob_std; chip->ecc.size = 256; @@ -2632,6 +2721,8 @@ int nand_scan_tail(struct mtd_info *mtd) chip->ecc.read_page = nand_read_page_raw; chip->ecc.write_page = nand_write_page_raw; chip->ecc.read_oob = nand_read_oob_std; + chip->ecc.read_page_raw = nand_read_page_raw; + chip->ecc.write_page_raw = nand_write_page_raw; chip->ecc.write_oob = nand_write_oob_std; chip->ecc.size = mtd->writesize; chip->ecc.bytes = 0; -- cgit v1.2.3-59-g8ed1b From ff4569c752c577c7e71e03c9d59e6ef85ca763c0 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 4 Mar 2009 12:01:37 -0800 Subject: [MTD] [NAND] davinci_nand driver This is a device driver for the NAND flash controller found on the various DaVinci family chips. It handles up to four SoC chipselects, and some flavors of secondary chipselect (e.g. based on upper bits of the address bus) as used with some multichip packages. (Including the 2 GiB chips used on some TI devel boards.) The 1-bit ECC hardware is supported (3 bytes ECC per 512 bytes data); but not yet the newer 4-bit ECC (10 bytes ECC per 512 bytes data), as available on chips like the DM355 or OMAP-L137 and needed with the more error-prone MLC NAND chips. This is a cleaned-up version of code that's been in use for several years now; sanity checked with the new drivers/mtd/tests. Signed-off-by: David Brownell Signed-off-by: Sudhakar Rajashekhara Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- arch/arm/mach-davinci/include/mach/nand.h | 80 ++++ drivers/mtd/nand/Kconfig | 7 + drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/davinci_nand.c | 584 ++++++++++++++++++++++++++++++ 4 files changed, 672 insertions(+) create mode 100644 arch/arm/mach-davinci/include/mach/nand.h create mode 100644 drivers/mtd/nand/davinci_nand.c diff --git a/arch/arm/mach-davinci/include/mach/nand.h b/arch/arm/mach-davinci/include/mach/nand.h new file mode 100644 index 000000000000..aa482841270b --- /dev/null +++ b/arch/arm/mach-davinci/include/mach/nand.h @@ -0,0 +1,80 @@ +/* + * mach-davinci/nand.h + * + * Copyright © 2006 Texas Instruments. + * + * Ported to 2.6.23 Copyright © 2008 by + * Sander Huijsen + * Troy Kisky + * Dirk Behme + * + * -------------------------------------------------------------------------- + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __ARCH_ARM_DAVINCI_NAND_H +#define __ARCH_ARM_DAVINCI_NAND_H + +#include + +#define NRCSR_OFFSET 0x00 +#define AWCCR_OFFSET 0x04 +#define A1CR_OFFSET 0x10 +#define NANDFCR_OFFSET 0x60 +#define NANDFSR_OFFSET 0x64 +#define NANDF1ECC_OFFSET 0x70 + +/* 4-bit ECC syndrome registers */ +#define NAND_4BIT_ECC_LOAD_OFFSET 0xbc +#define NAND_4BIT_ECC1_OFFSET 0xc0 +#define NAND_4BIT_ECC2_OFFSET 0xc4 +#define NAND_4BIT_ECC3_OFFSET 0xc8 +#define NAND_4BIT_ECC4_OFFSET 0xcc +#define NAND_ERR_ADD1_OFFSET 0xd0 +#define NAND_ERR_ADD2_OFFSET 0xd4 +#define NAND_ERR_ERRVAL1_OFFSET 0xd8 +#define NAND_ERR_ERRVAL2_OFFSET 0xdc + +/* NOTE: boards don't need to use these address bits + * for ALE/CLE unless they support booting from NAND. + * They're used unless platform data overrides them. + */ +#define MASK_ALE 0x08 +#define MASK_CLE 0x10 + +struct davinci_nand_pdata { /* platform_data */ + uint32_t mask_ale; + uint32_t mask_cle; + + /* for packages using two chipselects */ + uint32_t mask_chipsel; + + /* board's default static partition info */ + struct mtd_partition *parts; + unsigned nr_parts; + + /* none == NAND_ECC_NONE (strongly *not* advised!!) + * soft == NAND_ECC_SOFT + * 1-bit == NAND_ECC_HW + * 4-bit == NAND_ECC_HW_SYNDROME (not on all chips) + */ + nand_ecc_modes_t ecc_mode; + + /* e.g. NAND_BUSWIDTH_16 or NAND_USE_FLASH_BBT */ + unsigned options; +}; + +#endif /* __ARCH_ARM_DAVINCI_NAND_H */ diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 8b12e6e109d3..30d6fbc30c35 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -427,4 +427,11 @@ config MTD_NAND_SH_FLCTL Several Renesas SuperH CPU has FLCTL. This option enables support for NAND Flash using FLCTL. This driver support SH7723. +config MTD_NAND_DAVINCI + tristate "Support NAND on DaVinci SoC" + depends on ARCH_DAVINCI + help + Enable the driver for NAND flash chips on Texas Instruments + DaVinci processors. + endif # MTD_NAND diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index b661586afbfc..26dc0bef68ae 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_MTD_NAND_AU1550) += au1550nd.o obj-$(CONFIG_MTD_NAND_BF5XX) += bf5xx_nand.o obj-$(CONFIG_MTD_NAND_PPCHAMELEONEVB) += ppchameleonevb.o obj-$(CONFIG_MTD_NAND_S3C2410) += s3c2410.o +obj-$(CONFIG_MTD_NAND_DAVINCI) += davinci_nand.o obj-$(CONFIG_MTD_NAND_DISKONCHIP) += diskonchip.o obj-$(CONFIG_MTD_NAND_H1900) += h1910.o obj-$(CONFIG_MTD_NAND_RTC_FROM4) += rtc_from4.o diff --git a/drivers/mtd/nand/davinci_nand.c b/drivers/mtd/nand/davinci_nand.c new file mode 100644 index 000000000000..4bedd8dec61e --- /dev/null +++ b/drivers/mtd/nand/davinci_nand.c @@ -0,0 +1,584 @@ +/* + * davinci_nand.c - NAND Flash Driver for DaVinci family chips + * + * Copyright © 2006 Texas Instruments. + * + * Port to 2.6.23 Copyright © 2008 by: + * Sander Huijsen + * Troy Kisky + * Dirk Behme + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + + +#ifdef CONFIG_MTD_PARTITIONS +static inline int mtd_has_partitions(void) { return 1; } +#else +static inline int mtd_has_partitions(void) { return 0; } +#endif + +#ifdef CONFIG_MTD_CMDLINE_PARTS +static inline int mtd_has_cmdlinepart(void) { return 1; } +#else +static inline int mtd_has_cmdlinepart(void) { return 0; } +#endif + + +/* + * This is a device driver for the NAND flash controller found on the + * various DaVinci family chips. It handles up to four SoC chipselects, + * and some flavors of secondary chipselect (e.g. based on A12) as used + * with multichip packages. + * + * The 1-bit ECC hardware is supported, but not yet the newer 4-bit ECC + * available on chips like the DM355 and OMAP-L137 and needed with the + * more error-prone MLC NAND chips. + * + * This driver assumes EM_WAIT connects all the NAND devices' RDY/nBUSY + * outputs in a "wire-AND" configuration, with no per-chip signals. + */ +struct davinci_nand_info { + struct mtd_info mtd; + struct nand_chip chip; + + struct device *dev; + struct clk *clk; + bool partitioned; + + void __iomem *base; + void __iomem *vaddr; + + uint32_t ioaddr; + uint32_t current_cs; + + uint32_t mask_chipsel; + uint32_t mask_ale; + uint32_t mask_cle; + + uint32_t core_chipsel; +}; + +static DEFINE_SPINLOCK(davinci_nand_lock); + +#define to_davinci_nand(m) container_of(m, struct davinci_nand_info, mtd) + + +static inline unsigned int davinci_nand_readl(struct davinci_nand_info *info, + int offset) +{ + return __raw_readl(info->base + offset); +} + +static inline void davinci_nand_writel(struct davinci_nand_info *info, + int offset, unsigned long value) +{ + __raw_writel(value, info->base + offset); +} + +/*----------------------------------------------------------------------*/ + +/* + * Access to hardware control lines: ALE, CLE, secondary chipselect. + */ + +static void nand_davinci_hwcontrol(struct mtd_info *mtd, int cmd, + unsigned int ctrl) +{ + struct davinci_nand_info *info = to_davinci_nand(mtd); + uint32_t addr = info->current_cs; + struct nand_chip *nand = mtd->priv; + + /* Did the control lines change? */ + if (ctrl & NAND_CTRL_CHANGE) { + if ((ctrl & NAND_CTRL_CLE) == NAND_CTRL_CLE) + addr |= info->mask_cle; + else if ((ctrl & NAND_CTRL_ALE) == NAND_CTRL_ALE) + addr |= info->mask_ale; + + nand->IO_ADDR_W = (void __iomem __force *)addr; + } + + if (cmd != NAND_CMD_NONE) + iowrite8(cmd, nand->IO_ADDR_W); +} + +static void nand_davinci_select_chip(struct mtd_info *mtd, int chip) +{ + struct davinci_nand_info *info = to_davinci_nand(mtd); + uint32_t addr = info->ioaddr; + + /* maybe kick in a second chipselect */ + if (chip > 0) + addr |= info->mask_chipsel; + info->current_cs = addr; + + info->chip.IO_ADDR_W = (void __iomem __force *)addr; + info->chip.IO_ADDR_R = info->chip.IO_ADDR_W; +} + +/*----------------------------------------------------------------------*/ + +/* + * 1-bit hardware ECC ... context maintained for each core chipselect + */ + +static inline uint32_t nand_davinci_readecc_1bit(struct mtd_info *mtd) +{ + struct davinci_nand_info *info = to_davinci_nand(mtd); + + return davinci_nand_readl(info, NANDF1ECC_OFFSET + + 4 * info->core_chipsel); +} + +static void nand_davinci_hwctl_1bit(struct mtd_info *mtd, int mode) +{ + struct davinci_nand_info *info; + uint32_t nandcfr; + unsigned long flags; + + info = to_davinci_nand(mtd); + + /* Reset ECC hardware */ + nand_davinci_readecc_1bit(mtd); + + spin_lock_irqsave(&davinci_nand_lock, flags); + + /* Restart ECC hardware */ + nandcfr = davinci_nand_readl(info, NANDFCR_OFFSET); + nandcfr |= BIT(8 + info->core_chipsel); + davinci_nand_writel(info, NANDFCR_OFFSET, nandcfr); + + spin_unlock_irqrestore(&davinci_nand_lock, flags); +} + +/* + * Read hardware ECC value and pack into three bytes + */ +static int nand_davinci_calculate_1bit(struct mtd_info *mtd, + const u_char *dat, u_char *ecc_code) +{ + unsigned int ecc_val = nand_davinci_readecc_1bit(mtd); + unsigned int ecc24 = (ecc_val & 0x0fff) | ((ecc_val & 0x0fff0000) >> 4); + + /* invert so that erased block ecc is correct */ + ecc24 = ~ecc24; + ecc_code[0] = (u_char)(ecc24); + ecc_code[1] = (u_char)(ecc24 >> 8); + ecc_code[2] = (u_char)(ecc24 >> 16); + + return 0; +} + +static int nand_davinci_correct_1bit(struct mtd_info *mtd, u_char *dat, + u_char *read_ecc, u_char *calc_ecc) +{ + struct nand_chip *chip = mtd->priv; + uint32_t eccNand = read_ecc[0] | (read_ecc[1] << 8) | + (read_ecc[2] << 16); + uint32_t eccCalc = calc_ecc[0] | (calc_ecc[1] << 8) | + (calc_ecc[2] << 16); + uint32_t diff = eccCalc ^ eccNand; + + if (diff) { + if ((((diff >> 12) ^ diff) & 0xfff) == 0xfff) { + /* Correctable error */ + if ((diff >> (12 + 3)) < chip->ecc.size) { + dat[diff >> (12 + 3)] ^= BIT((diff >> 12) & 7); + return 1; + } else { + return -1; + } + } else if (!(diff & (diff - 1))) { + /* Single bit ECC error in the ECC itself, + * nothing to fix */ + return 1; + } else { + /* Uncorrectable error */ + return -1; + } + + } + return 0; +} + +/*----------------------------------------------------------------------*/ + +/* + * NOTE: NAND boot requires ALE == EM_A[1], CLE == EM_A[2], so that's + * how these chips are normally wired. This translates to both 8 and 16 + * bit busses using ALE == BIT(3) in byte addresses, and CLE == BIT(4). + * + * For now we assume that configuration, or any other one which ignores + * the two LSBs for NAND access ... so we can issue 32-bit reads/writes + * and have that transparently morphed into multiple NAND operations. + */ +static void nand_davinci_read_buf(struct mtd_info *mtd, uint8_t *buf, int len) +{ + struct nand_chip *chip = mtd->priv; + + if ((0x03 & ((unsigned)buf)) == 0 && (0x03 & len) == 0) + ioread32_rep(chip->IO_ADDR_R, buf, len >> 2); + else if ((0x01 & ((unsigned)buf)) == 0 && (0x01 & len) == 0) + ioread16_rep(chip->IO_ADDR_R, buf, len >> 1); + else + ioread8_rep(chip->IO_ADDR_R, buf, len); +} + +static void nand_davinci_write_buf(struct mtd_info *mtd, + const uint8_t *buf, int len) +{ + struct nand_chip *chip = mtd->priv; + + if ((0x03 & ((unsigned)buf)) == 0 && (0x03 & len) == 0) + iowrite32_rep(chip->IO_ADDR_R, buf, len >> 2); + else if ((0x01 & ((unsigned)buf)) == 0 && (0x01 & len) == 0) + iowrite16_rep(chip->IO_ADDR_R, buf, len >> 1); + else + iowrite8_rep(chip->IO_ADDR_R, buf, len); +} + +/* + * Check hardware register for wait status. Returns 1 if device is ready, + * 0 if it is still busy. + */ +static int nand_davinci_dev_ready(struct mtd_info *mtd) +{ + struct davinci_nand_info *info = to_davinci_nand(mtd); + + return davinci_nand_readl(info, NANDFSR_OFFSET) & BIT(0); +} + +static void __init nand_dm6446evm_flash_init(struct davinci_nand_info *info) +{ + uint32_t regval, a1cr; + + /* + * NAND FLASH timings @ PLL1 == 459 MHz + * - AEMIF.CLK freq = PLL1/6 = 459/6 = 76.5 MHz + * - AEMIF.CLK period = 1/76.5 MHz = 13.1 ns + */ + regval = 0 + | (0 << 31) /* selectStrobe */ + | (0 << 30) /* extWait (never with NAND) */ + | (1 << 26) /* writeSetup 10 ns */ + | (3 << 20) /* writeStrobe 40 ns */ + | (1 << 17) /* writeHold 10 ns */ + | (0 << 13) /* readSetup 10 ns */ + | (3 << 7) /* readStrobe 60 ns */ + | (0 << 4) /* readHold 10 ns */ + | (3 << 2) /* turnAround ?? ns */ + | (0 << 0) /* asyncSize 8-bit bus */ + ; + a1cr = davinci_nand_readl(info, A1CR_OFFSET); + if (a1cr != regval) { + dev_dbg(info->dev, "Warning: NAND config: Set A1CR " \ + "reg to 0x%08x, was 0x%08x, should be done by " \ + "bootloader.\n", regval, a1cr); + davinci_nand_writel(info, A1CR_OFFSET, regval); + } +} + +/*----------------------------------------------------------------------*/ + +static int __init nand_davinci_probe(struct platform_device *pdev) +{ + struct davinci_nand_pdata *pdata = pdev->dev.platform_data; + struct davinci_nand_info *info; + struct resource *res1; + struct resource *res2; + void __iomem *vaddr; + void __iomem *base; + int ret; + uint32_t val; + nand_ecc_modes_t ecc_mode; + + /* which external chipselect will we be managing? */ + if (pdev->id < 0 || pdev->id > 3) + return -ENODEV; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + dev_err(&pdev->dev, "unable to allocate memory\n"); + ret = -ENOMEM; + goto err_nomem; + } + + platform_set_drvdata(pdev, info); + + res1 = platform_get_resource(pdev, IORESOURCE_MEM, 0); + res2 = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (!res1 || !res2) { + dev_err(&pdev->dev, "resource missing\n"); + ret = -EINVAL; + goto err_nomem; + } + + vaddr = ioremap(res1->start, res1->end - res1->start); + base = ioremap(res2->start, res2->end - res2->start); + if (!vaddr || !base) { + dev_err(&pdev->dev, "ioremap failed\n"); + ret = -EINVAL; + goto err_ioremap; + } + + info->dev = &pdev->dev; + info->base = base; + info->vaddr = vaddr; + + info->mtd.priv = &info->chip; + info->mtd.name = dev_name(&pdev->dev); + info->mtd.owner = THIS_MODULE; + + info->chip.IO_ADDR_R = vaddr; + info->chip.IO_ADDR_W = vaddr; + info->chip.chip_delay = 0; + info->chip.select_chip = nand_davinci_select_chip; + + /* options such as NAND_USE_FLASH_BBT or 16-bit widths */ + info->chip.options = pdata ? pdata->options : 0; + + info->ioaddr = (uint32_t __force) vaddr; + + info->current_cs = info->ioaddr; + info->core_chipsel = pdev->id; + info->mask_chipsel = pdata->mask_chipsel; + + /* use nandboot-capable ALE/CLE masks by default */ + if (pdata && pdata->mask_ale) + info->mask_ale = pdata->mask_cle; + else + info->mask_ale = MASK_ALE; + if (pdata && pdata->mask_cle) + info->mask_cle = pdata->mask_cle; + else + info->mask_cle = MASK_CLE; + + /* Set address of hardware control function */ + info->chip.cmd_ctrl = nand_davinci_hwcontrol; + info->chip.dev_ready = nand_davinci_dev_ready; + + /* Speed up buffer I/O */ + info->chip.read_buf = nand_davinci_read_buf; + info->chip.write_buf = nand_davinci_write_buf; + + /* use board-specific ECC config; else, the best available */ + if (pdata) + ecc_mode = pdata->ecc_mode; + else if (cpu_is_davinci_dm355()) + ecc_mode = NAND_ECC_HW_SYNDROME; + else + ecc_mode = NAND_ECC_HW; + + switch (ecc_mode) { + case NAND_ECC_NONE: + case NAND_ECC_SOFT: + break; + case NAND_ECC_HW: + info->chip.ecc.calculate = nand_davinci_calculate_1bit; + info->chip.ecc.correct = nand_davinci_correct_1bit; + info->chip.ecc.hwctl = nand_davinci_hwctl_1bit; + info->chip.ecc.size = 512; + info->chip.ecc.bytes = 3; + break; + case NAND_ECC_HW_SYNDROME: + /* FIXME implement */ + info->chip.ecc.size = 512; + info->chip.ecc.bytes = 10; + + dev_warn(&pdev->dev, "4-bit ECC nyet supported\n"); + /* FALL THROUGH */ + default: + ret = -EINVAL; + goto err_ecc; + } + info->chip.ecc.mode = ecc_mode; + + info->clk = clk_get(&pdev->dev, "AEMIFCLK"); + if (IS_ERR(info->clk)) { + ret = PTR_ERR(info->clk); + dev_dbg(&pdev->dev, "unable to get AEMIFCLK, err %d\n", ret); + goto err_clk; + } + + ret = clk_enable(info->clk); + if (ret < 0) { + dev_dbg(&pdev->dev, "unable to enable AEMIFCLK, err %d\n", ret); + goto err_clk_enable; + } + + /* EMIF timings should normally be set by the boot loader, + * especially after boot-from-NAND. The *only* reason to + * have this special casing for the DM6446 EVM is to work + * with boot-from-NOR ... with CS0 manually re-jumpered + * (after startup) so it addresses the NAND flash, not NOR. + * Even for dev boards, that's unusually rude... + */ + if (machine_is_davinci_evm()) + nand_dm6446evm_flash_init(info); + + spin_lock_irq(&davinci_nand_lock); + + /* put CSxNAND into NAND mode */ + val = davinci_nand_readl(info, NANDFCR_OFFSET); + val |= BIT(info->core_chipsel); + davinci_nand_writel(info, NANDFCR_OFFSET, val); + + spin_unlock_irq(&davinci_nand_lock); + + /* Scan to find existence of the device(s) */ + ret = nand_scan(&info->mtd, pdata->mask_chipsel ? 2 : 1); + if (ret < 0) { + dev_dbg(&pdev->dev, "no NAND chip(s) found\n"); + goto err_scan; + } + + if (mtd_has_partitions()) { + struct mtd_partition *mtd_parts = NULL; + int mtd_parts_nb = 0; + + if (mtd_has_cmdlinepart()) { + static const char *probes[] __initconst = + { "cmdlinepart", NULL }; + + const char *master_name; + + /* Set info->mtd.name = 0 temporarily */ + master_name = info->mtd.name; + info->mtd.name = (char *)0; + + /* info->mtd.name == 0, means: don't bother checking + */ + mtd_parts_nb = parse_mtd_partitions(&info->mtd, probes, + &mtd_parts, 0); + + /* Restore info->mtd.name */ + info->mtd.name = master_name; + } + + if (mtd_parts_nb <= 0 && pdata) { + mtd_parts = pdata->parts; + mtd_parts_nb = pdata->nr_parts; + } + + /* Register any partitions */ + if (mtd_parts_nb > 0) { + ret = add_mtd_partitions(&info->mtd, + mtd_parts, mtd_parts_nb); + if (ret == 0) + info->partitioned = true; + } + + } else if (pdata && pdata->nr_parts) { + dev_warn(&pdev->dev, "ignoring %d default partitions on %s\n", + pdata->nr_parts, info->mtd.name); + } + + /* If there's no partition info, just package the whole chip + * as a single MTD device. + */ + if (!info->partitioned) + ret = add_mtd_device(&info->mtd) ? -ENODEV : 0; + + if (ret < 0) + goto err_scan; + + val = davinci_nand_readl(info, NRCSR_OFFSET); + dev_info(&pdev->dev, "controller rev. %d.%d\n", + (val >> 8) & 0xff, val & 0xff); + + return 0; + +err_scan: + clk_disable(info->clk); + +err_clk_enable: + clk_put(info->clk); + +err_ecc: +err_clk: +err_ioremap: + if (base) + iounmap(base); + if (vaddr) + iounmap(vaddr); + +err_nomem: + kfree(info); + return ret; +} + +static int __exit nand_davinci_remove(struct platform_device *pdev) +{ + struct davinci_nand_info *info = platform_get_drvdata(pdev); + int status; + + if (mtd_has_partitions() && info->partitioned) + status = del_mtd_partitions(&info->mtd); + else + status = del_mtd_device(&info->mtd); + + iounmap(info->base); + iounmap(info->vaddr); + + nand_release(&info->mtd); + + clk_disable(info->clk); + clk_put(info->clk); + + kfree(info); + + return 0; +} + +static struct platform_driver nand_davinci_driver = { + .remove = __exit_p(nand_davinci_remove), + .driver = { + .name = "davinci_nand", + }, +}; +MODULE_ALIAS("platform:davinci_nand"); + +static int __init nand_davinci_init(void) +{ + return platform_driver_probe(&nand_davinci_driver, nand_davinci_probe); +} +module_init(nand_davinci_init); + +static void __exit nand_davinci_exit(void) +{ + platform_driver_unregister(&nand_davinci_driver); +} +module_exit(nand_davinci_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Texas Instruments"); +MODULE_DESCRIPTION("Davinci NAND flash driver"); + -- cgit v1.2.3-59-g8ed1b From 374555aeb64f450db04edb8c7c218d763f2a8781 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 4 Mar 2009 12:01:38 -0800 Subject: [MTD] [NAND] fix broken debug messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix incorrect debug messages (*write* not read); someone committed some cut'n'paste bugs.  There might be more, I only noticed these since I was looking for nand_read usage and landed in some very wrong functions. IMO all MTD debugging message framework is goofed, anyway. It uses "DEBUG" in a way that's incompatible with usage most everywhere else in the kernel, and which prevents normal pr_dbg() and dev_dbg() calls from working right. [True. It predates those by a long way, and should probably be updated to use them. dwmw2] Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/nand/nand_base.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index d902370df3e0..0793ca39cc88 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1946,7 +1946,7 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, } if (unlikely(ops->ooboffs >= len)) { - DEBUG(MTD_DEBUG_LEVEL0, "nand_read_oob: " + DEBUG(MTD_DEBUG_LEVEL0, "nand_do_write_oob: " "Attempt to start write outside oob\n"); return -EINVAL; } @@ -1956,7 +1956,7 @@ static int nand_do_write_oob(struct mtd_info *mtd, loff_t to, ops->ooboffs + ops->ooblen > ((mtd->size >> chip->page_shift) - (to >> chip->page_shift)) * len)) { - DEBUG(MTD_DEBUG_LEVEL0, "nand_read_oob: " + DEBUG(MTD_DEBUG_LEVEL0, "nand_do_write_oob: " "Attempt write beyond end of device\n"); return -EINVAL; } @@ -2012,8 +2012,8 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to, /* Do not allow writes past end of device */ if (ops->datbuf && (to + ops->len) > mtd->size) { - DEBUG(MTD_DEBUG_LEVEL0, "nand_read_oob: " - "Attempt read beyond end of device\n"); + DEBUG(MTD_DEBUG_LEVEL0, "nand_write_oob: " + "Attempt write beyond end of device\n"); return -EINVAL; } -- cgit v1.2.3-59-g8ed1b From d5e539ad7d8305f12d04b4a278f8cf791e3de4db Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Wed, 4 Mar 2009 12:01:39 -0800 Subject: [MTD] [NAND] davinci: drop usage of cpu_is_* macro Usage of davinci-specific cpu_is macros is not allowed in drivers. These options should be passed in through platform_data. Signed-off-by: Kevin Hilman Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/nand/davinci_nand.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/mtd/nand/davinci_nand.c b/drivers/mtd/nand/davinci_nand.c index 4bedd8dec61e..4034ac945041 100644 --- a/drivers/mtd/nand/davinci_nand.c +++ b/drivers/mtd/nand/davinci_nand.c @@ -33,7 +33,6 @@ #include #include -#include #include #include @@ -392,8 +391,6 @@ static int __init nand_davinci_probe(struct platform_device *pdev) /* use board-specific ECC config; else, the best available */ if (pdata) ecc_mode = pdata->ecc_mode; - else if (cpu_is_davinci_dm355()) - ecc_mode = NAND_ECC_HW_SYNDROME; else ecc_mode = NAND_ECC_HW; -- cgit v1.2.3-59-g8ed1b From 7ed8c7d440d497913a2218831a67b5897e0e86e1 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 4 Mar 2009 12:01:40 -0800 Subject: [MTD] we don't need no misc devices Remove from various drivers which don't actually use any of its contents. There are still a number of these left in arch-specific bits of the tree. (Found by diffing results of "grep -rl" for linux/miscdevice.h and for misc_register, examining the differences, and verifying removals with a build test.) Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/devices/doc2000.c | 1 - drivers/mtd/devices/doc2001.c | 1 - drivers/mtd/devices/doc2001plus.c | 1 - drivers/mtd/devices/docecc.c | 1 - drivers/mtd/inftlmount.c | 1 - drivers/mtd/nftlcore.c | 1 - 6 files changed, 6 deletions(-) diff --git a/drivers/mtd/devices/doc2000.c b/drivers/mtd/devices/doc2000.c index 50de839c77a9..5bf5f460e132 100644 --- a/drivers/mtd/devices/doc2000.c +++ b/drivers/mtd/devices/doc2000.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mtd/devices/doc2001.c b/drivers/mtd/devices/doc2001.c index e32c568c1145..0990f7803628 100644 --- a/drivers/mtd/devices/doc2001.c +++ b/drivers/mtd/devices/doc2001.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mtd/devices/doc2001plus.c b/drivers/mtd/devices/doc2001plus.c index d853f891b586..719b2915dc3a 100644 --- a/drivers/mtd/devices/doc2001plus.c +++ b/drivers/mtd/devices/doc2001plus.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mtd/devices/docecc.c b/drivers/mtd/devices/docecc.c index 874e51b110a2..a19cda52da5c 100644 --- a/drivers/mtd/devices/docecc.c +++ b/drivers/mtd/devices/docecc.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c index f751dd97c549..32e82aef3e53 100644 --- a/drivers/mtd/inftlmount.c +++ b/drivers/mtd/inftlmount.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index d1c4546513f7..f5bcd4969462 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From a4b6d516a6079c6ba8dc97d185371439035a35d0 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 4 Mar 2009 12:01:41 -0800 Subject: [MTD] partitioning utility predicates Move mtd_has_partitions() and mtd_has_cmdlinepart() inlines from a DaVinci-specific driver to the header. Use those to eliminate #ifdefs in two drivers which had their own definitions of mtd_has_partitions(). Quite a lot of other MTD drivers could benefit from using use one or both of these to remove #ifdeffery. Maybe some Janitors would like to help. Signed-off-by: David Brownell Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/devices/m25p80.c | 17 ++++++----------- drivers/mtd/devices/mtd_dataflash.c | 16 ++++++---------- drivers/mtd/nand/davinci_nand.c | 13 ------------- include/linux/mtd/partitions.h | 12 ++++++++++++ 4 files changed, 24 insertions(+), 34 deletions(-) diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 7c3fc766dcf1..98b0faf6696d 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -65,12 +65,6 @@ #define FAST_READ_DUMMY_BYTE 0 #endif -#ifdef CONFIG_MTD_PARTITIONS -#define mtd_has_partitions() (1) -#else -#define mtd_has_partitions() (0) -#endif - /****************************************************************************/ struct m25p { @@ -708,12 +702,13 @@ static int __devinit m25p_probe(struct spi_device *spi) struct mtd_partition *parts = NULL; int nr_parts = 0; -#ifdef CONFIG_MTD_CMDLINE_PARTS - static const char *part_probes[] = { "cmdlinepart", NULL, }; + if (mtd_has_cmdlinepart()) { + static const char *part_probes[] + = { "cmdlinepart", NULL, }; - nr_parts = parse_mtd_partitions(&flash->mtd, - part_probes, &parts, 0); -#endif + nr_parts = parse_mtd_partitions(&flash->mtd, + part_probes, &parts, 0); + } if (nr_parts <= 0 && data && data->parts) { parts = data->parts; diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c index 6d9f810565c8..d95f74a93bce 100644 --- a/drivers/mtd/devices/mtd_dataflash.c +++ b/drivers/mtd/devices/mtd_dataflash.c @@ -98,12 +98,6 @@ struct dataflash { struct mtd_info mtd; }; -#ifdef CONFIG_MTD_PARTITIONS -#define mtd_has_partitions() (1) -#else -#define mtd_has_partitions() (0) -#endif - /* ......................................................................... */ /* @@ -682,11 +676,13 @@ add_dataflash_otp(struct spi_device *spi, char *name, struct mtd_partition *parts; int nr_parts = 0; -#ifdef CONFIG_MTD_CMDLINE_PARTS - static const char *part_probes[] = { "cmdlinepart", NULL, }; + if (mtd_has_cmdlinepart()) { + static const char *part_probes[] + = { "cmdlinepart", NULL, }; - nr_parts = parse_mtd_partitions(device, part_probes, &parts, 0); -#endif + nr_parts = parse_mtd_partitions(device, + part_probes, &parts, 0); + } if (nr_parts <= 0 && pdata && pdata->parts) { parts = pdata->parts; diff --git a/drivers/mtd/nand/davinci_nand.c b/drivers/mtd/nand/davinci_nand.c index 4034ac945041..81f7ecd23c60 100644 --- a/drivers/mtd/nand/davinci_nand.c +++ b/drivers/mtd/nand/davinci_nand.c @@ -38,19 +38,6 @@ #include -#ifdef CONFIG_MTD_PARTITIONS -static inline int mtd_has_partitions(void) { return 1; } -#else -static inline int mtd_has_partitions(void) { return 0; } -#endif - -#ifdef CONFIG_MTD_CMDLINE_PARTS -static inline int mtd_has_cmdlinepart(void) { return 1; } -#else -static inline int mtd_has_cmdlinepart(void) { return 0; } -#endif - - /* * This is a device driver for the NAND flash controller found on the * various DaVinci family chips. It handles up to four SoC chipselects, diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index a45dd831b3f8..7535a74083b9 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -76,4 +76,16 @@ int __devinit of_mtd_parse_partitions(struct device *dev, struct device_node *node, struct mtd_partition **pparts); +#ifdef CONFIG_MTD_PARTITIONS +static inline int mtd_has_partitions(void) { return 1; } +#else +static inline int mtd_has_partitions(void) { return 0; } +#endif + +#ifdef CONFIG_MTD_CMDLINE_PARTS +static inline int mtd_has_cmdlinepart(void) { return 1; } +#else +static inline int mtd_has_cmdlinepart(void) { return 0; } +#endif + #endif -- cgit v1.2.3-59-g8ed1b From fc371a25eab8816d49c2d322d91b48a11e206018 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Wed, 4 Mar 2009 12:01:41 -0800 Subject: [JFFS2] jffs2_acl_count() tests < 0 on unsigned size_t s is unsigned and cannot be less than 0. Signed-off-by: Roel Kluin Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- fs/jffs2/acl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index d98713777a1b..6e63e8b41066 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size) size_t s; size -= sizeof(struct jffs2_acl_header); - s = size - 4 * sizeof(struct jffs2_acl_entry_short); - if (s < 0) { + if (size < 4 * sizeof(struct jffs2_acl_entry_short)) { if (size % sizeof(struct jffs2_acl_entry_short)) return -1; return size / sizeof(struct jffs2_acl_entry_short); } else { + s = size - 4 * sizeof(struct jffs2_acl_entry_short); if (s % sizeof(struct jffs2_acl_entry)) return -1; return s / sizeof(struct jffs2_acl_entry) + 4; -- cgit v1.2.3-59-g8ed1b From 7d4e9ccb435e51e013e63abd340b4f496428139c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 20 Mar 2009 19:11:12 +0200 Subject: UBIFS: fix commentaries Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 4 ++-- fs/ubifs/debug.c | 2 +- fs/ubifs/file.c | 2 +- fs/ubifs/journal.c | 2 +- fs/ubifs/log.c | 2 +- fs/ubifs/lpt_commit.c | 2 +- fs/ubifs/replay.c | 2 +- fs/ubifs/tnc.c | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 8cd425b628ee..af1914462f02 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -322,8 +322,8 @@ static int can_use_rp(struct ubifs_info *c) * be large, because UBIFS does not do any index consolidation as long as * there is free space. IOW, the index may take a lot of LEBs, but the LEBs * will contain a lot of dirt. - * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be - * consolidated to take up to @c->min_idx_lebs LEBs. + * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW, + * the index may be consolidated to take up to @c->min_idx_lebs LEBs. * * This function returns zero in case of success, and %-ENOSPC in case of * failure. diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 93f6532eff00..ce2cd8343618 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) /* * Make sure the last key in our znode is less or - * equivalent than the the key in zbranch which goes + * equivalent than the key in the zbranch which goes * after our pointing zbranch. */ cmp = keys_cmp(c, max, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 4e7f0aca9ebc..4e256b8f56b2 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -959,7 +959,7 @@ static int do_writepage(struct page *page, int len) * whole index and correct all inode sizes, which is long an unacceptable. * * To prevent situations like this, UBIFS writes pages back only if they are - * within last synchronized inode size, i.e. the the size which has been + * within the last synchronized inode size, i.e. the size which has been * written to the flash media last time. Otherwise, UBIFS forces inode * write-back, thus making sure the on-flash inode contains current inode size, * and then keeps writing pages back. diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index a2d334eccbca..64b5f3a309f5 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -1365,7 +1365,7 @@ out_ro: * @host: host inode * * This function writes the updated version of an extended attribute inode and - * the host inode tho the journal (to the base head). The host inode is written + * the host inode to the journal (to the base head). The host inode is written * after the extended attribute inode in order to guarantee that the extended * attribute will be flushed when the inode is synchronized by 'fsync()' and * consequently, the write-buffer is synchronized. This function returns zero diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 1004261dc864..56e33772a1ee 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) } /* - * Make sure the the amount of space in buds will not exceed + * Make sure the amount of space in buds will not exceed the * 'c->max_bud_bytes' limit, because we want to guarantee mount time * limits. * diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 9d77f68b2f8e..8cbfb8248025 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1,4 +1,4 @@ - /* +/* * This file is part of UBIFS. * * Copyright (C) 2006-2008 Nokia Corporation. diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index ce42a7b0ca5a..11cc80125a49 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) dirty -= c->leb_size - lp->free; /* * If the replay order was perfect the dirty space would now be - * zero. The order is not perfect because the the journal heads + * zero. The order is not perfect because the journal heads * race with each other. This is not a problem but is does mean * that the dirty space may temporarily exceed c->leb_size * during the replay. diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index fa28a84c6a1b..f249f7b0d656 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, * splitting in the middle of the colliding sequence. Also, when * removing the leftmost key, we would have to correct the key of the * parent node, which would introduce additional complications. Namely, - * if we changed the the leftmost key of the parent znode, the garbage + * if we changed the leftmost key of the parent znode, the garbage * collector would be unable to find it (GC is doing this when GC'ing * indexing LEBs). Although we already have an additional RB-tree where * we save such changed znodes (see 'ins_clr_old_idx_znode()') until -- cgit v1.2.3-59-g8ed1b From f10770f5e56b4297701fd7c3e551b206f98d7ac2 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 8 Mar 2009 15:13:00 +0200 Subject: UBIFS: fully sort GCed nodes The 'joinup()' function cannot deal with situations when nodes go in reverse order - it just leaves them in this order. This patch implement full nodes sorting using n*log(n) algorithm. It sorts data nodes for bulk-read, and direntry nodes for readdir(). Signed-off-by: Artem Bityutskiy --- fs/ubifs/gc.c | 428 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 296 insertions(+), 132 deletions(-) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index a711d33b3d3e..f0f5f15d384e 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -47,7 +47,7 @@ * have to waste large pieces of free space at the end of LEB B, because nodes * from LEB A would not fit. And the worst situation is when all nodes are of * maximum size. So dark watermark is the amount of free + dirty space in LEB - * which are guaranteed to be reclaimable. If LEB has less space, the GC migh + * which are guaranteed to be reclaimable. If LEB has less space, the GC might * be unable to reclaim it. So, LEBs with free + dirty greater than dark * watermark are "good" LEBs from GC's point of few. The other LEBs are not so * good, and GC takes extra care when moving them. @@ -56,14 +56,6 @@ #include #include "ubifs.h" -/* - * GC tries to optimize the way it fit nodes to available space, and it sorts - * nodes a little. The below constants are watermarks which define "large", - * "medium", and "small" nodes. - */ -#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4) -#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ - /* * GC may need to move more than one LEB to make progress. The below constants * define "soft" and "hard" limits on the number of LEBs the garbage collector @@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c) } /** - * joinup - bring data nodes for an inode together. - * @c: UBIFS file-system description object - * @sleb: describes scanned LEB - * @inum: inode number - * @blk: block number - * @data: list to which to add data nodes + * list_sort - sort a list. + * @priv: private data, passed to @cmp + * @head: the list to sort + * @cmp: the elements comparison function * - * This function looks at the first few nodes in the scanned LEB @sleb and adds - * them to @data if they are data nodes from @inum and have a larger block - * number than @blk. This function returns %0 on success and a negative error - * code on failure. + * This function has been implemented by Mark J Roberts . It + * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted + * in ascending order. + * + * The comparison function @cmp is supposed to return a negative value if @a is + * than @b, and a positive value if @a is greater than @b. If @a and @b are + * equivalent, then it does not matter what this function returns. */ -static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, - unsigned int blk, struct list_head *data) +static void list_sort(void *priv, struct list_head *head, + int (*cmp)(void *priv, struct list_head *a, + struct list_head *b)) { - int err, cnt = 6, lnum = sleb->lnum, offs; - struct ubifs_scan_node *snod, *tmp; - union ubifs_key *key; + struct list_head *p, *q, *e, *list, *tail, *oldhead; + int insize, nmerges, psize, qsize, i; + + if (list_empty(head)) + return; + + list = head->next; + list_del(head); + insize = 1; + for (;;) { + p = oldhead = list; + list = tail = NULL; + nmerges = 0; + + while (p) { + nmerges++; + q = p; + psize = 0; + for (i = 0; i < insize; i++) { + psize++; + q = q->next == oldhead ? NULL : q->next; + if (!q) + break; + } - list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { - key = &snod->key; - if (key_inum(c, key) == inum && - key_type(c, key) == UBIFS_DATA_KEY && - key_block(c, key) > blk) { - offs = snod->offs; - err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); - if (err < 0) - return err; - list_del(&snod->list); - if (err) { - list_add_tail(&snod->list, data); - blk = key_block(c, key); - } else - kfree(snod); - cnt = 6; - } else if (--cnt == 0) + qsize = insize; + while (psize > 0 || (qsize > 0 && q)) { + if (!psize) { + e = q; + q = q->next; + qsize--; + if (q == oldhead) + q = NULL; + } else if (!qsize || !q) { + e = p; + p = p->next; + psize--; + if (p == oldhead) + p = NULL; + } else if (cmp(priv, p, q) <= 0) { + e = p; + p = p->next; + psize--; + if (p == oldhead) + p = NULL; + } else { + e = q; + q = q->next; + qsize--; + if (q == oldhead) + q = NULL; + } + if (tail) + tail->next = e; + else + list = e; + e->prev = tail; + tail = e; + } + p = q; + } + + tail->next = list; + list->prev = tail; + + if (nmerges <= 1) break; + + insize *= 2; } - return 0; + + head->next = list; + head->prev = list->prev; + list->prev->next = head; + list->prev = head; } /** - * move_nodes - move nodes. + * data_nodes_cmp - compare 2 data nodes. + * @priv: UBIFS file-system description object + * @a: first data node + * @a: second data node + * + * This function compares data nodes @a and @b. Returns %1 if @a has greater + * inode or block number, and %-1 otherwise. + */ +int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); + ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + unsigned int blka = key_block(c, &sa->key); + unsigned int blkb = key_block(c, &sb->key); + + if (blka <= blkb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/* + * nondata_nodes_cmp - compare 2 non-data nodes. + * @priv: UBIFS file-system description object + * @a: first node + * @a: second node + * + * This function compares nodes @a and @b. It makes sure that inode nodes go + * first and sorted by length in descending order. Directory entry nodes go + * after inode nodes and are sorted in ascending hash valuer order. + */ +int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + int typea, typeb; + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + typea = key_type(c, &sa->key); + typeb = key_type(c, &sb->key); + ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); + + /* Inodes go before directory entries */ + if (typea == UBIFS_INO_KEY) { + if (typeb == UBIFS_INO_KEY) + return sb->len - sa->len; + return -1; + } + if (typeb == UBIFS_INO_KEY) + return 1; + + ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + uint32_t hasha = key_hash(c, &sa->key); + uint32_t hashb = key_hash(c, &sb->key); + + if (hasha <= hashb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/** + * sort_nodes - sort nodes for GC. * @c: UBIFS file-system description object - * @sleb: describes nodes to move + * @sleb: describes nodes to sort and contains the result on exit + * @nondata: contains non-data nodes on exit + * @min: minimum node size is returned here * - * This function moves valid nodes from data LEB described by @sleb to the GC - * journal head. The obsolete nodes are dropped. + * This function sorts the list of inodes to garbage collect. First of all, it + * kills obsolete nodes and separates data and non-data nodes to the + * @sleb->nodes and @nondata lists correspondingly. + * + * Data nodes are then sorted in block number order - this is important for + * bulk-read; data nodes with lower inode number go before data nodes with + * higher inode number, and data nodes with lower block number go before data + * nodes with higher block number; * - * When moving nodes we have to deal with classical bin-packing problem: the - * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", - * where the nodes in the @sleb->nodes list are the elements which should be - * fit optimally to the bins. This function uses the "first fit decreasing" - * strategy, although it does not really sort the nodes but just split them on - * 3 classes - large, medium, and small, so they are roughly sorted. + * Non-data nodes are sorted as follows. + * o First go inode nodes - they are sorted in descending length order. + * o Then go directory entry nodes - they are sorted in hash order, which + * should supposedly optimize 'readdir()'. Direntry nodes with lower parent + * inode number go before direntry nodes with higher parent inode number, + * and direntry nodes with lower name hash values go before direntry nodes + * with higher name hash values. * - * This function returns zero in case of success, %-EAGAIN if commit is - * required, and other negative error codes in case of other failures. + * This function returns zero in case of success and a negative error code in + * case of failure. */ -static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct list_head *nondata, int *min) { struct ubifs_scan_node *snod, *tmp; - struct list_head data, large, medium, small; - struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; - int avail, err, min = INT_MAX; - unsigned int blk = 0; - ino_t inum = 0; - INIT_LIST_HEAD(&data); - INIT_LIST_HEAD(&large); - INIT_LIST_HEAD(&medium); - INIT_LIST_HEAD(&small); + *min = INT_MAX; - while (!list_empty(&sleb->nodes)) { - struct list_head *lst = sleb->nodes.next; - - snod = list_entry(lst, struct ubifs_scan_node, list); + /* Separate data nodes and non-data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + int err; ubifs_assert(snod->type != UBIFS_IDX_NODE); ubifs_assert(snod->type != UBIFS_REF_NODE); @@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, snod->offs, 0); if (err < 0) - goto out; + return err; - list_del(lst); if (!err) { /* The node is obsolete, remove it from the list */ + list_del(&snod->list); kfree(snod); continue; } - /* - * Sort the list of nodes so that data nodes go first, large - * nodes go second, and small nodes go last. - */ - if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { - if (inum != key_inum(c, &snod->key)) { - if (inum) { - /* - * Try to move data nodes from the same - * inode together. - */ - err = joinup(c, sleb, inum, blk, &data); - if (err) - goto out; - } - inum = key_inum(c, &snod->key); - blk = key_block(c, &snod->key); - } - list_add_tail(lst, &data); - } else if (snod->len > MEDIUM_NODE_WM) - list_add_tail(lst, &large); - else if (snod->len > SMALL_NODE_WM) - list_add_tail(lst, &medium); - else - list_add_tail(lst, &small); - - /* And find the smallest node */ - if (snod->len < min) - min = snod->len; + if (snod->len < *min) + *min = snod->len; + + if (key_type(c, &snod->key) != UBIFS_DATA_KEY) + list_move_tail(&snod->list, nondata); } - /* - * Join the tree lists so that we'd have one roughly sorted list - * ('large' will be the head of the joined list). - */ - list_splice(&data, &large); - list_splice(&medium, large.prev); - list_splice(&small, large.prev); + /* Sort data and non-data nodes */ + list_sort(c, &sleb->nodes, &data_nodes_cmp); + list_sort(c, nondata, &nondata_nodes_cmp); + return 0; +} + +/** + * move_node - move a node. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * @snod: the mode to move + * @wbuf: write-buffer to move node to + * + * This function moves node @snod to @wbuf, changes TNC correspondingly, and + * destroys @snod. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) +{ + int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; + + cond_resched(); + err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); + if (err) + return err; + + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, + snod->offs, new_lnum, new_offs, + snod->len); + list_del(&snod->list); + kfree(snod); + return err; +} + +/** + * move_nodes - move nodes. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * + * This function moves valid nodes from data LEB described by @sleb to the GC + * journal head. This function returns zero in case of success, %-EAGAIN if + * commit is required, and other negative error codes in case of other + * failures. + */ +static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +{ + int err, min; + LIST_HEAD(nondata); + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; if (wbuf->lnum == -1) { /* @@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) */ err = switch_gc_head(c); if (err) - goto out; + return err; } + err = sort_nodes(c, sleb, &nondata, &min); + if (err) + goto out; + /* Write nodes to their new location. Use the first-fit strategy */ while (1) { - avail = c->leb_size - wbuf->offs - wbuf->used; - list_for_each_entry_safe(snod, tmp, &large, list) { - int new_lnum, new_offs; + int avail; + struct ubifs_scan_node *snod, *tmp; + + /* Move data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (snod->len > avail) + /* + * Do not skip data nodes in order to optimize + * bulk-read. + */ + break; + + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + /* Move non-data nodes */ + list_for_each_entry_safe(snod, tmp, &nondata, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; if (avail < min) break; - if (snod->len > avail) - /* This node does not fit */ + if (snod->len > avail) { + /* + * Keep going only if this is an inode with + * some data. Otherwise stop and switch the GC + * head. IOW, we assume that data-less inode + * nodes and direntry nodes are roughly of the + * same size. + */ + if (key_type(c, &snod->key) == UBIFS_DENT_KEY || + snod->len == UBIFS_INO_NODE_SZ) + break; continue; + } - cond_resched(); - - new_lnum = wbuf->lnum; - new_offs = wbuf->offs + wbuf->used; - err = ubifs_wbuf_write_nolock(wbuf, snod->node, - snod->len); + err = move_node(c, sleb, snod, wbuf); if (err) goto out; - err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, - snod->offs, new_lnum, new_offs, - snod->len); - if (err) - goto out; - - avail = c->leb_size - wbuf->offs - wbuf->used; - list_del(&snod->list); - kfree(snod); } - if (list_empty(&large)) + if (list_empty(&sleb->nodes) && list_empty(&nondata)) break; /* @@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) return 0; out: - list_for_each_entry_safe(snod, tmp, &large, list) { - list_del(&snod->list); - kfree(snod); - } + list_splice_tail(&nondata, &sleb->nodes); return err; } -- cgit v1.2.3-59-g8ed1b From fcabb3479e2b15abfd2d2ef5363295f16e98b2d7 Mon Sep 17 00:00:00 2001 From: Hunter Adrian Date: Wed, 18 Mar 2009 12:29:39 +0100 Subject: UBIFS: fix compiler warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs/ubifs/super.c: In function ‘ubifs_show_options’: fs/ubifs/super.c:425: warning: format not a string literal and no format arguments fs/ubifs/super.c: In function ‘mount_ubifs’: fs/ubifs/super.c:1204: warning: format not a string literal and no format arguments fs/ubifs/super.c: In function ‘ubifs_remount_rw’: fs/ubifs/super.c:1557: warning: format not a string literal and no format arguments Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 7bdd248ec770..372c7fb66531 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",no_chk_data_crc"); if (c->mount_opts.override_compr) { - seq_printf(s, ",compr="); - seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); + seq_printf(s, ",compr=%s", + ubifs_compr_name(c->mount_opts.compr_type)); } return 0; @@ -1204,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c) goto out_cbuf; /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1561,7 +1561,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) ubifs_create_buds_lists(c); /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; -- cgit v1.2.3-59-g8ed1b From a591f5d35e89be90c04830d7de01c276af68aeb7 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Wed, 4 Mar 2009 12:01:31 -0800 Subject: [MTD] [NAND] TXx9: add NDFMC support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add platform support for NAND Flash Memory Controller of TXx9 SoCs. Signed-off-by: Atsushi Nemoto Acked-By: Ralf Bächle Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- arch/mips/include/asm/txx9/ndfmc.h | 30 ++++++++++++++++++++++++++++++ arch/mips/include/asm/txx9/tx4938.h | 1 + arch/mips/include/asm/txx9/tx4939.h | 2 ++ arch/mips/txx9/generic/setup.c | 21 +++++++++++++++++++++ arch/mips/txx9/generic/setup_tx4938.c | 21 +++++++++++++++++++++ arch/mips/txx9/generic/setup_tx4939.c | 17 +++++++++++++++++ arch/mips/txx9/rbtx4938/setup.c | 2 ++ arch/mips/txx9/rbtx4939/setup.c | 4 ++++ 8 files changed, 98 insertions(+) create mode 100644 arch/mips/include/asm/txx9/ndfmc.h diff --git a/arch/mips/include/asm/txx9/ndfmc.h b/arch/mips/include/asm/txx9/ndfmc.h new file mode 100644 index 000000000000..fa67f3df78fc --- /dev/null +++ b/arch/mips/include/asm/txx9/ndfmc.h @@ -0,0 +1,30 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * (C) Copyright TOSHIBA CORPORATION 2007 + */ +#ifndef __ASM_TXX9_NDFMC_H +#define __ASM_TXX9_NDFMC_H + +#define NDFMC_PLAT_FLAG_USE_BSPRT 0x01 +#define NDFMC_PLAT_FLAG_NO_RSTR 0x02 +#define NDFMC_PLAT_FLAG_HOLDADD 0x04 +#define NDFMC_PLAT_FLAG_DUMMYWRITE 0x08 + +struct txx9ndfmc_platform_data { + unsigned int shift; + unsigned int gbus_clock; + unsigned int hold; /* hold time in nanosecond */ + unsigned int spw; /* strobe pulse width in nanosecond */ + unsigned int flags; + unsigned char ch_mask; /* available channel bitmask */ + unsigned char wp_mask; /* write-protect bitmask */ + unsigned char wide_mask; /* 16bit-nand bitmask */ +}; + +void txx9_ndfmc_init(unsigned long baseaddr, + const struct txx9ndfmc_platform_data *plat_data); + +#endif /* __ASM_TXX9_NDFMC_H */ diff --git a/arch/mips/include/asm/txx9/tx4938.h b/arch/mips/include/asm/txx9/tx4938.h index 0b068154054c..cd8bc2021755 100644 --- a/arch/mips/include/asm/txx9/tx4938.h +++ b/arch/mips/include/asm/txx9/tx4938.h @@ -291,6 +291,7 @@ int tx4938_pcic1_map_irq(const struct pci_dev *dev, u8 slot); void tx4938_setup_pcierr_irq(void); void tx4938_irq_init(void); void tx4938_mtd_init(int ch); +void tx4938_ndfmc_init(unsigned int hold, unsigned int spw); struct tx4938ide_platform_info { /* diff --git a/arch/mips/include/asm/txx9/tx4939.h b/arch/mips/include/asm/txx9/tx4939.h index 964ef7ede268..f02c50b3abfb 100644 --- a/arch/mips/include/asm/txx9/tx4939.h +++ b/arch/mips/include/asm/txx9/tx4939.h @@ -542,5 +542,7 @@ int tx4939_irq(void); void tx4939_mtd_init(int ch); void tx4939_ata_init(void); void tx4939_rtc_init(void); +void tx4939_ndfmc_init(unsigned int hold, unsigned int spw, + unsigned char ch_mask, unsigned char wide_mask); #endif /* __ASM_TXX9_TX4939_H */ diff --git a/arch/mips/txx9/generic/setup.c b/arch/mips/txx9/generic/setup.c index a13a08b8c9ec..8a266c6a3f58 100644 --- a/arch/mips/txx9/generic/setup.c +++ b/arch/mips/txx9/generic/setup.c @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef CONFIG_CPU_TX49XX #include #endif @@ -691,6 +692,26 @@ void __init txx9_physmap_flash_init(int no, unsigned long addr, #endif } +void __init txx9_ndfmc_init(unsigned long baseaddr, + const struct txx9ndfmc_platform_data *pdata) +{ +#if defined(CONFIG_MTD_NAND_TXX9NDFMC) || \ + defined(CONFIG_MTD_NAND_TXX9NDFMC_MODULE) + struct resource res = { + .start = baseaddr, + .end = baseaddr + 0x1000 - 1, + .flags = IORESOURCE_MEM, + }; + struct platform_device *pdev = platform_device_alloc("txx9ndfmc", -1); + + if (!pdev || + platform_device_add_resources(pdev, &res, 1) || + platform_device_add_data(pdev, pdata, sizeof(*pdata)) || + platform_device_add(pdev)) + platform_device_put(pdev); +#endif +} + #if defined(CONFIG_LEDS_GPIO) || defined(CONFIG_LEDS_GPIO_MODULE) static DEFINE_SPINLOCK(txx9_iocled_lock); diff --git a/arch/mips/txx9/generic/setup_tx4938.c b/arch/mips/txx9/generic/setup_tx4938.c index 25819ff1c350..f0844f891f0b 100644 --- a/arch/mips/txx9/generic/setup_tx4938.c +++ b/arch/mips/txx9/generic/setup_tx4938.c @@ -23,6 +23,7 @@ #include #include #include +#include #include static void __init tx4938_wdr_init(void) @@ -382,6 +383,26 @@ void __init tx4938_ata_init(unsigned int irq, unsigned int shift, int tune) platform_device_put(pdev); } +void __init tx4938_ndfmc_init(unsigned int hold, unsigned int spw) +{ + struct txx9ndfmc_platform_data plat_data = { + .shift = 1, + .gbus_clock = txx9_gbus_clock, + .hold = hold, + .spw = spw, + .ch_mask = 1, + }; + unsigned long baseaddr = TX4938_NDFMC_REG & 0xfffffffffULL; + +#ifdef __BIG_ENDIAN + baseaddr += 4; +#endif + if ((__raw_readq(&tx4938_ccfgptr->pcfg) & + (TX4938_PCFG_ATA_SEL|TX4938_PCFG_ISA_SEL|TX4938_PCFG_NDF_SEL)) == + TX4938_PCFG_NDF_SEL) + txx9_ndfmc_init(baseaddr, &plat_data); +} + static void __init tx4938_stop_unused_modules(void) { __u64 pcfg, rst = 0, ckd = 0; diff --git a/arch/mips/txx9/generic/setup_tx4939.c b/arch/mips/txx9/generic/setup_tx4939.c index 55440967b3a8..7a25b573e9b0 100644 --- a/arch/mips/txx9/generic/setup_tx4939.c +++ b/arch/mips/txx9/generic/setup_tx4939.c @@ -27,6 +27,7 @@ #include #include #include +#include #include static void __init tx4939_wdr_init(void) @@ -457,6 +458,22 @@ void __init tx4939_rtc_init(void) platform_device_register(&rtc_dev); } +void __init tx4939_ndfmc_init(unsigned int hold, unsigned int spw, + unsigned char ch_mask, unsigned char wide_mask) +{ + struct txx9ndfmc_platform_data plat_data = { + .shift = 1, + .gbus_clock = txx9_gbus_clock, + .hold = hold, + .spw = spw, + .flags = NDFMC_PLAT_FLAG_NO_RSTR | NDFMC_PLAT_FLAG_HOLDADD | + NDFMC_PLAT_FLAG_DUMMYWRITE, + .ch_mask = ch_mask, + .wide_mask = wide_mask, + }; + txx9_ndfmc_init(TX4939_NDFMC_REG & 0xfffffffffULL, &plat_data); +} + static void __init tx4939_stop_unused_modules(void) { __u64 pcfg, rst = 0, ckd = 0; diff --git a/arch/mips/txx9/rbtx4938/setup.c b/arch/mips/txx9/rbtx4938/setup.c index 547ff2920bf0..65d13df8878a 100644 --- a/arch/mips/txx9/rbtx4938/setup.c +++ b/arch/mips/txx9/rbtx4938/setup.c @@ -352,6 +352,8 @@ static void __init rbtx4938_device_init(void) rbtx4938_ne_init(); tx4938_wdt_init(); rbtx4938_mtd_init(); + /* TC58DVM82A1FT: tDH=10ns, tWP=tRP=tREADID=35ns */ + tx4938_ndfmc_init(10, 35); tx4938_ata_init(RBTX4938_IRQ_IOC_ATA, 0, 1); txx9_iocled_init(RBTX4938_LED_ADDR - IO_BASE, -1, 8, 1, "green", NULL); } diff --git a/arch/mips/txx9/rbtx4939/setup.c b/arch/mips/txx9/rbtx4939/setup.c index 656603b85b71..74839f2a111e 100644 --- a/arch/mips/txx9/rbtx4939/setup.c +++ b/arch/mips/txx9/rbtx4939/setup.c @@ -333,6 +333,10 @@ static void __init rbtx4939_device_init(void) platform_device_add_data(pdev, &smc_pdata, sizeof(smc_pdata)) || platform_device_add(pdev)) platform_device_put(pdev); + /* TC58DVM82A1FT: tDH=10ns, tWP=tRP=tREADID=35ns */ + tx4939_ndfmc_init(10, 35, + (1 << 1) | (1 << 2), + (1 << 2)); /* ch1:8bit, ch2:16bit */ rbtx4939_led_setup(); tx4939_wdt_init(); tx4939_ata_init(); -- cgit v1.2.3-59-g8ed1b From cbf77c1bd9c79d1742976862d0b2bebaff1ea14d Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Wed, 4 Mar 2009 12:01:33 -0800 Subject: [MTD] RBTX4939: add MTD support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add platform support for NOR flash chips on RBTX4939 board. This board has complex flash mappings, controlled by its DIPSW setting. [akpm@linux-foundation.org: Use min_t] Signed-off-by: Atsushi Nemoto Cc: Ralf Bächle Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- arch/mips/include/asm/txx9/rbtx4939.h | 9 ++ arch/mips/txx9/rbtx4939/setup.c | 157 ++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) diff --git a/arch/mips/include/asm/txx9/rbtx4939.h b/arch/mips/include/asm/txx9/rbtx4939.h index 1acf428c0b4f..e517899794a8 100644 --- a/arch/mips/include/asm/txx9/rbtx4939.h +++ b/arch/mips/include/asm/txx9/rbtx4939.h @@ -130,4 +130,13 @@ void rbtx4939_prom_init(void); void rbtx4939_irq_setup(void); +struct mtd_partition; +struct map_info; +struct rbtx4939_flash_data { + unsigned int width; + unsigned int nr_parts; + struct mtd_partition *parts; + void (*map_init)(struct map_info *map); +}; + #endif /* __ASM_TXX9_RBTX4939_H */ diff --git a/arch/mips/txx9/rbtx4939/setup.c b/arch/mips/txx9/rbtx4939/setup.c index 74839f2a111e..011e1e332f47 100644 --- a/arch/mips/txx9/rbtx4939/setup.c +++ b/arch/mips/txx9/rbtx4939/setup.c @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -282,6 +285,159 @@ static void rbtx4939_7segled_putc(unsigned int pos, unsigned char val) __rbtx4939_7segled_putc(pos, val); } +#if defined(CONFIG_MTD_RBTX4939) || defined(CONFIG_MTD_RBTX4939_MODULE) +/* special mapping for boot rom */ +static unsigned long rbtx4939_flash_fixup_ofs(unsigned long ofs) +{ + u8 bdipsw = readb(rbtx4939_bdipsw_addr) & 0x0f; + unsigned char shift; + + if (bdipsw & 8) { + /* BOOT Mode: USER ROM1 / USER ROM2 */ + shift = bdipsw & 3; + /* rotate A[23:22] */ + return (ofs & ~0xc00000) | ((((ofs >> 22) + shift) & 3) << 22); + } +#ifdef __BIG_ENDIAN + if (bdipsw == 0) + /* BOOT Mode: Monitor ROM */ + ofs ^= 0x400000; /* swap A[22] */ +#endif + return ofs; +} + +static map_word rbtx4939_flash_read16(struct map_info *map, unsigned long ofs) +{ + map_word r; + + ofs = rbtx4939_flash_fixup_ofs(ofs); + r.x[0] = __raw_readw(map->virt + ofs); + return r; +} + +static void rbtx4939_flash_write16(struct map_info *map, const map_word datum, + unsigned long ofs) +{ + ofs = rbtx4939_flash_fixup_ofs(ofs); + __raw_writew(datum.x[0], map->virt + ofs); + mb(); /* see inline_map_write() in mtd/map.h */ +} + +static void rbtx4939_flash_copy_from(struct map_info *map, void *to, + unsigned long from, ssize_t len) +{ + u8 bdipsw = readb(rbtx4939_bdipsw_addr) & 0x0f; + unsigned char shift; + ssize_t curlen; + + from += (unsigned long)map->virt; + if (bdipsw & 8) { + /* BOOT Mode: USER ROM1 / USER ROM2 */ + shift = bdipsw & 3; + while (len) { + curlen = min_t(unsigned long, len, + 0x400000 - (from & (0x400000 - 1))); + memcpy(to, + (void *)((from & ~0xc00000) | + ((((from >> 22) + shift) & 3) << 22)), + curlen); + len -= curlen; + from += curlen; + to += curlen; + } + return; + } +#ifdef __BIG_ENDIAN + if (bdipsw == 0) { + /* BOOT Mode: Monitor ROM */ + while (len) { + curlen = min_t(unsigned long, len, + 0x400000 - (from & (0x400000 - 1))); + memcpy(to, (void *)(from ^ 0x400000), curlen); + len -= curlen; + from += curlen; + to += curlen; + } + return; + } +#endif + memcpy(to, (void *)from, len); +} + +static void rbtx4939_flash_map_init(struct map_info *map) +{ + map->read = rbtx4939_flash_read16; + map->write = rbtx4939_flash_write16; + map->copy_from = rbtx4939_flash_copy_from; +} + +static void __init rbtx4939_mtd_init(void) +{ + static struct { + struct platform_device dev; + struct resource res; + struct rbtx4939_flash_data data; + } pdevs[4]; + int i; + static char names[4][8]; + static struct mtd_partition parts[4]; + struct rbtx4939_flash_data *boot_pdata = &pdevs[0].data; + u8 bdipsw = readb(rbtx4939_bdipsw_addr) & 0x0f; + + if (bdipsw & 8) { + /* BOOT Mode: USER ROM1 / USER ROM2 */ + boot_pdata->nr_parts = 4; + for (i = 0; i < boot_pdata->nr_parts; i++) { + sprintf(names[i], "img%d", 4 - i); + parts[i].name = names[i]; + parts[i].size = 0x400000; + parts[i].offset = MTDPART_OFS_NXTBLK; + } + } else if (bdipsw == 0) { + /* BOOT Mode: Monitor ROM */ + boot_pdata->nr_parts = 2; + strcpy(names[0], "big"); + strcpy(names[1], "little"); + for (i = 0; i < boot_pdata->nr_parts; i++) { + parts[i].name = names[i]; + parts[i].size = 0x400000; + parts[i].offset = MTDPART_OFS_NXTBLK; + } + } else { + /* BOOT Mode: ROM Emulator */ + boot_pdata->nr_parts = 2; + parts[0].name = "boot"; + parts[0].offset = 0xc00000; + parts[0].size = 0x400000; + parts[1].name = "user"; + parts[1].offset = 0; + parts[1].size = 0xc00000; + } + boot_pdata->parts = parts; + boot_pdata->map_init = rbtx4939_flash_map_init; + + for (i = 0; i < ARRAY_SIZE(pdevs); i++) { + struct resource *r = &pdevs[i].res; + struct platform_device *dev = &pdevs[i].dev; + + r->start = 0x1f000000 - i * 0x1000000; + r->end = r->start + 0x1000000 - 1; + r->flags = IORESOURCE_MEM; + pdevs[i].data.width = 2; + dev->num_resources = 1; + dev->resource = r; + dev->id = i; + dev->name = "rbtx4939-flash"; + dev->dev.platform_data = &pdevs[i].data; + platform_device_register(dev); + } +} +#else +static void __init rbtx4939_mtd_init(void) +{ +} +#endif + static void __init rbtx4939_arch_init(void) { rbtx4939_pci_setup(); @@ -333,6 +489,7 @@ static void __init rbtx4939_device_init(void) platform_device_add_data(pdev, &smc_pdata, sizeof(smc_pdata)) || platform_device_add(pdev)) platform_device_put(pdev); + rbtx4939_mtd_init(); /* TC58DVM82A1FT: tDH=10ns, tWP=tRP=tREADID=35ns */ tx4939_ndfmc_init(10, 35, (1 << 1) | (1 << 2), -- cgit v1.2.3-59-g8ed1b From 64fb65baffa5b8f6f2eb3f628dec43c22cd1031f Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Wed, 4 Mar 2009 12:01:34 -0800 Subject: [MTD] TXx9 SoC NAND Flash Memory Controller driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for the integrated NAND flash controller of the TXx9 family. Once upon a time there were tx4925ndfmc and tx4938ndfmc driver. They were removed due to bitrot in 2005. This new driver is completely rewritten based on a driver in CELF patch archive. Signed-off-by: Atsushi Nemoto Cc: Ralf Bächle Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/nand/Kconfig | 6 + drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/txx9ndfmc.c | 428 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 435 insertions(+) create mode 100644 drivers/mtd/nand/txx9ndfmc.c diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 30d6fbc30c35..68ae144ce3be 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -434,4 +434,10 @@ config MTD_NAND_DAVINCI Enable the driver for NAND flash chips on Texas Instruments DaVinci processors. +config MTD_NAND_TXX9NDFMC + tristate "NAND Flash support for TXx9 SoC" + depends on SOC_TX4938 || SOC_TX4939 + help + This enables the NAND flash controller on the TXx9 SoCs. + endif # MTD_NAND diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index 26dc0bef68ae..d228622d5dc0 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -37,5 +37,6 @@ obj-$(CONFIG_MTD_NAND_FSL_ELBC) += fsl_elbc_nand.o obj-$(CONFIG_MTD_NAND_FSL_UPM) += fsl_upm.o obj-$(CONFIG_MTD_NAND_SH_FLCTL) += sh_flctl.o obj-$(CONFIG_MTD_NAND_MXC) += mxc_nand.o +obj-$(CONFIG_MTD_NAND_TXX9NDFMC) += txx9ndfmc.o nand-objs := nand_base.o nand_bbt.o diff --git a/drivers/mtd/nand/txx9ndfmc.c b/drivers/mtd/nand/txx9ndfmc.c new file mode 100644 index 000000000000..812479264896 --- /dev/null +++ b/drivers/mtd/nand/txx9ndfmc.c @@ -0,0 +1,428 @@ +/* + * TXx9 NAND flash memory controller driver + * Based on RBTX49xx patch from CELF patch archive. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * (C) Copyright TOSHIBA CORPORATION 2004-2007 + * All Rights Reserved. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* TXX9 NDFMC Registers */ +#define TXX9_NDFDTR 0x00 +#define TXX9_NDFMCR 0x04 +#define TXX9_NDFSR 0x08 +#define TXX9_NDFISR 0x0c +#define TXX9_NDFIMR 0x10 +#define TXX9_NDFSPR 0x14 +#define TXX9_NDFRSTR 0x18 /* not TX4939 */ + +/* NDFMCR : NDFMC Mode Control */ +#define TXX9_NDFMCR_WE 0x80 +#define TXX9_NDFMCR_ECC_ALL 0x60 +#define TXX9_NDFMCR_ECC_RESET 0x60 +#define TXX9_NDFMCR_ECC_READ 0x40 +#define TXX9_NDFMCR_ECC_ON 0x20 +#define TXX9_NDFMCR_ECC_OFF 0x00 +#define TXX9_NDFMCR_CE 0x10 +#define TXX9_NDFMCR_BSPRT 0x04 /* TX4925/TX4926 only */ +#define TXX9_NDFMCR_ALE 0x02 +#define TXX9_NDFMCR_CLE 0x01 +/* TX4939 only */ +#define TXX9_NDFMCR_X16 0x0400 +#define TXX9_NDFMCR_DMAREQ_MASK 0x0300 +#define TXX9_NDFMCR_DMAREQ_NODMA 0x0000 +#define TXX9_NDFMCR_DMAREQ_128 0x0100 +#define TXX9_NDFMCR_DMAREQ_256 0x0200 +#define TXX9_NDFMCR_DMAREQ_512 0x0300 +#define TXX9_NDFMCR_CS_MASK 0x0c +#define TXX9_NDFMCR_CS(ch) ((ch) << 2) + +/* NDFMCR : NDFMC Status */ +#define TXX9_NDFSR_BUSY 0x80 +/* TX4939 only */ +#define TXX9_NDFSR_DMARUN 0x40 + +/* NDFMCR : NDFMC Reset */ +#define TXX9_NDFRSTR_RST 0x01 + +struct txx9ndfmc_priv { + struct platform_device *dev; + struct nand_chip chip; + struct mtd_info mtd; + int cs; + char mtdname[BUS_ID_SIZE + 2]; +}; + +#define MAX_TXX9NDFMC_DEV 4 +struct txx9ndfmc_drvdata { + struct mtd_info *mtds[MAX_TXX9NDFMC_DEV]; + void __iomem *base; + unsigned char hold; /* in gbusclock */ + unsigned char spw; /* in gbusclock */ + struct nand_hw_control hw_control; +#ifdef CONFIG_MTD_PARTITIONS + struct mtd_partition *parts[MAX_TXX9NDFMC_DEV]; +#endif +}; + +static struct platform_device *mtd_to_platdev(struct mtd_info *mtd) +{ + struct nand_chip *chip = mtd->priv; + struct txx9ndfmc_priv *txx9_priv = chip->priv; + return txx9_priv->dev; +} + +static void __iomem *ndregaddr(struct platform_device *dev, unsigned int reg) +{ + struct txx9ndfmc_drvdata *drvdata = platform_get_drvdata(dev); + struct txx9ndfmc_platform_data *plat = dev->dev.platform_data; + + return drvdata->base + (reg << plat->shift); +} + +static u32 txx9ndfmc_read(struct platform_device *dev, unsigned int reg) +{ + return __raw_readl(ndregaddr(dev, reg)); +} + +static void txx9ndfmc_write(struct platform_device *dev, + u32 val, unsigned int reg) +{ + __raw_writel(val, ndregaddr(dev, reg)); +} + +static uint8_t txx9ndfmc_read_byte(struct mtd_info *mtd) +{ + struct platform_device *dev = mtd_to_platdev(mtd); + + return txx9ndfmc_read(dev, TXX9_NDFDTR); +} + +static void txx9ndfmc_write_buf(struct mtd_info *mtd, const uint8_t *buf, + int len) +{ + struct platform_device *dev = mtd_to_platdev(mtd); + void __iomem *ndfdtr = ndregaddr(dev, TXX9_NDFDTR); + u32 mcr = txx9ndfmc_read(dev, TXX9_NDFMCR); + + txx9ndfmc_write(dev, mcr | TXX9_NDFMCR_WE, TXX9_NDFMCR); + while (len--) + __raw_writel(*buf++, ndfdtr); + txx9ndfmc_write(dev, mcr, TXX9_NDFMCR); +} + +static void txx9ndfmc_read_buf(struct mtd_info *mtd, uint8_t *buf, int len) +{ + struct platform_device *dev = mtd_to_platdev(mtd); + void __iomem *ndfdtr = ndregaddr(dev, TXX9_NDFDTR); + + while (len--) + *buf++ = __raw_readl(ndfdtr); +} + +static int txx9ndfmc_verify_buf(struct mtd_info *mtd, const uint8_t *buf, + int len) +{ + struct platform_device *dev = mtd_to_platdev(mtd); + void __iomem *ndfdtr = ndregaddr(dev, TXX9_NDFDTR); + + while (len--) + if (*buf++ != (uint8_t)__raw_readl(ndfdtr)) + return -EFAULT; + return 0; +} + +static void txx9ndfmc_cmd_ctrl(struct mtd_info *mtd, int cmd, + unsigned int ctrl) +{ + struct nand_chip *chip = mtd->priv; + struct txx9ndfmc_priv *txx9_priv = chip->priv; + struct platform_device *dev = txx9_priv->dev; + struct txx9ndfmc_platform_data *plat = dev->dev.platform_data; + + if (ctrl & NAND_CTRL_CHANGE) { + u32 mcr = txx9ndfmc_read(dev, TXX9_NDFMCR); + + mcr &= ~(TXX9_NDFMCR_CLE | TXX9_NDFMCR_ALE | TXX9_NDFMCR_CE); + mcr |= ctrl & NAND_CLE ? TXX9_NDFMCR_CLE : 0; + mcr |= ctrl & NAND_ALE ? TXX9_NDFMCR_ALE : 0; + /* TXX9_NDFMCR_CE bit is 0:high 1:low */ + mcr |= ctrl & NAND_NCE ? TXX9_NDFMCR_CE : 0; + if (txx9_priv->cs >= 0 && (ctrl & NAND_NCE)) { + mcr &= ~TXX9_NDFMCR_CS_MASK; + mcr |= TXX9_NDFMCR_CS(txx9_priv->cs); + } + txx9ndfmc_write(dev, mcr, TXX9_NDFMCR); + } + if (cmd != NAND_CMD_NONE) + txx9ndfmc_write(dev, cmd & 0xff, TXX9_NDFDTR); + if (plat->flags & NDFMC_PLAT_FLAG_DUMMYWRITE) { + /* dummy write to update external latch */ + if ((ctrl & NAND_CTRL_CHANGE) && cmd == NAND_CMD_NONE) + txx9ndfmc_write(dev, 0, TXX9_NDFDTR); + } + mmiowb(); +} + +static int txx9ndfmc_dev_ready(struct mtd_info *mtd) +{ + struct platform_device *dev = mtd_to_platdev(mtd); + + return !(txx9ndfmc_read(dev, TXX9_NDFSR) & TXX9_NDFSR_BUSY); +} + +static int txx9ndfmc_calculate_ecc(struct mtd_info *mtd, const uint8_t *dat, + uint8_t *ecc_code) +{ + struct platform_device *dev = mtd_to_platdev(mtd); + u32 mcr = txx9ndfmc_read(dev, TXX9_NDFMCR); + + mcr &= ~TXX9_NDFMCR_ECC_ALL; + txx9ndfmc_write(dev, mcr | TXX9_NDFMCR_ECC_OFF, TXX9_NDFMCR); + txx9ndfmc_write(dev, mcr | TXX9_NDFMCR_ECC_READ, TXX9_NDFMCR); + ecc_code[1] = txx9ndfmc_read(dev, TXX9_NDFDTR); + ecc_code[0] = txx9ndfmc_read(dev, TXX9_NDFDTR); + ecc_code[2] = txx9ndfmc_read(dev, TXX9_NDFDTR); + txx9ndfmc_write(dev, mcr | TXX9_NDFMCR_ECC_OFF, TXX9_NDFMCR); + return 0; +} + +static void txx9ndfmc_enable_hwecc(struct mtd_info *mtd, int mode) +{ + struct platform_device *dev = mtd_to_platdev(mtd); + u32 mcr = txx9ndfmc_read(dev, TXX9_NDFMCR); + + mcr &= ~TXX9_NDFMCR_ECC_ALL; + txx9ndfmc_write(dev, mcr | TXX9_NDFMCR_ECC_RESET, TXX9_NDFMCR); + txx9ndfmc_write(dev, mcr | TXX9_NDFMCR_ECC_OFF, TXX9_NDFMCR); + txx9ndfmc_write(dev, mcr | TXX9_NDFMCR_ECC_ON, TXX9_NDFMCR); +} + +static void txx9ndfmc_initialize(struct platform_device *dev) +{ + struct txx9ndfmc_platform_data *plat = dev->dev.platform_data; + struct txx9ndfmc_drvdata *drvdata = platform_get_drvdata(dev); + int tmout = 100; + + if (plat->flags & NDFMC_PLAT_FLAG_NO_RSTR) + ; /* no NDFRSTR. Write to NDFSPR resets the NDFMC. */ + else { + /* reset NDFMC */ + txx9ndfmc_write(dev, + txx9ndfmc_read(dev, TXX9_NDFRSTR) | + TXX9_NDFRSTR_RST, + TXX9_NDFRSTR); + while (txx9ndfmc_read(dev, TXX9_NDFRSTR) & TXX9_NDFRSTR_RST) { + if (--tmout == 0) { + dev_err(&dev->dev, "reset failed.\n"); + break; + } + udelay(1); + } + } + /* setup Hold Time, Strobe Pulse Width */ + txx9ndfmc_write(dev, (drvdata->hold << 4) | drvdata->spw, TXX9_NDFSPR); + txx9ndfmc_write(dev, + (plat->flags & NDFMC_PLAT_FLAG_USE_BSPRT) ? + TXX9_NDFMCR_BSPRT : 0, TXX9_NDFMCR); +} + +#define TXX9NDFMC_NS_TO_CYC(gbusclk, ns) \ + DIV_ROUND_UP((ns) * DIV_ROUND_UP(gbusclk, 1000), 1000000) + +static int __init txx9ndfmc_probe(struct platform_device *dev) +{ + struct txx9ndfmc_platform_data *plat = dev->dev.platform_data; +#ifdef CONFIG_MTD_PARTITIONS + static const char *probes[] = { "cmdlinepart", NULL }; +#endif + int hold, spw; + int i; + struct txx9ndfmc_drvdata *drvdata; + unsigned long gbusclk = plat->gbus_clock; + struct resource *res; + + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + drvdata = devm_kzalloc(&dev->dev, sizeof(*drvdata), GFP_KERNEL); + if (!drvdata) + return -ENOMEM; + if (!devm_request_mem_region(&dev->dev, res->start, + resource_size(res), dev_name(&dev->dev))) + return -EBUSY; + drvdata->base = devm_ioremap(&dev->dev, res->start, + resource_size(res)); + if (!drvdata->base) + return -EBUSY; + + hold = plat->hold ?: 20; /* tDH */ + spw = plat->spw ?: 90; /* max(tREADID, tWP, tRP) */ + + hold = TXX9NDFMC_NS_TO_CYC(gbusclk, hold); + spw = TXX9NDFMC_NS_TO_CYC(gbusclk, spw); + if (plat->flags & NDFMC_PLAT_FLAG_HOLDADD) + hold -= 2; /* actual hold time : (HOLD + 2) BUSCLK */ + spw -= 1; /* actual wait time : (SPW + 1) BUSCLK */ + hold = clamp(hold, 1, 15); + drvdata->hold = hold; + spw = clamp(spw, 1, 15); + drvdata->spw = spw; + dev_info(&dev->dev, "CLK:%ldMHz HOLD:%d SPW:%d\n", + (gbusclk + 500000) / 1000000, hold, spw); + + spin_lock_init(&drvdata->hw_control.lock); + init_waitqueue_head(&drvdata->hw_control.wq); + + platform_set_drvdata(dev, drvdata); + txx9ndfmc_initialize(dev); + + for (i = 0; i < MAX_TXX9NDFMC_DEV; i++) { + struct txx9ndfmc_priv *txx9_priv; + struct nand_chip *chip; + struct mtd_info *mtd; +#ifdef CONFIG_MTD_PARTITIONS + int nr_parts; +#endif + + if (!(plat->ch_mask & (1 << i))) + continue; + txx9_priv = kzalloc(sizeof(struct txx9ndfmc_priv), + GFP_KERNEL); + if (!txx9_priv) { + dev_err(&dev->dev, "Unable to allocate " + "TXx9 NDFMC MTD device structure.\n"); + continue; + } + chip = &txx9_priv->chip; + mtd = &txx9_priv->mtd; + mtd->owner = THIS_MODULE; + + mtd->priv = chip; + + chip->read_byte = txx9ndfmc_read_byte; + chip->read_buf = txx9ndfmc_read_buf; + chip->write_buf = txx9ndfmc_write_buf; + chip->verify_buf = txx9ndfmc_verify_buf; + chip->cmd_ctrl = txx9ndfmc_cmd_ctrl; + chip->dev_ready = txx9ndfmc_dev_ready; + chip->ecc.calculate = txx9ndfmc_calculate_ecc; + chip->ecc.correct = nand_correct_data; + chip->ecc.hwctl = txx9ndfmc_enable_hwecc; + chip->ecc.mode = NAND_ECC_HW; + chip->ecc.size = 256; + chip->ecc.bytes = 3; + chip->chip_delay = 100; + chip->controller = &drvdata->hw_control; + + chip->priv = txx9_priv; + txx9_priv->dev = dev; + + if (plat->ch_mask != 1) { + txx9_priv->cs = i; + sprintf(txx9_priv->mtdname, "%s.%u", + dev_name(&dev->dev), i); + } else { + txx9_priv->cs = -1; + strcpy(txx9_priv->mtdname, dev_name(&dev->dev)); + } + if (plat->wide_mask & (1 << i)) + chip->options |= NAND_BUSWIDTH_16; + + if (nand_scan(mtd, 1)) { + kfree(txx9_priv); + continue; + } + mtd->name = txx9_priv->mtdname; + +#ifdef CONFIG_MTD_PARTITIONS + nr_parts = parse_mtd_partitions(mtd, probes, + &drvdata->parts[i], 0); + if (nr_parts > 0) + add_mtd_partitions(mtd, drvdata->parts[i], nr_parts); +#endif + add_mtd_device(mtd); + drvdata->mtds[i] = mtd; + } + + return 0; +} + +static int __exit txx9ndfmc_remove(struct platform_device *dev) +{ + struct txx9ndfmc_drvdata *drvdata = platform_get_drvdata(dev); + int i; + + platform_set_drvdata(dev, NULL); + if (!drvdata) + return 0; + for (i = 0; i < MAX_TXX9NDFMC_DEV; i++) { + struct mtd_info *mtd = drvdata->mtds[i]; + struct nand_chip *chip; + struct txx9ndfmc_priv *txx9_priv; + + if (!mtd) + continue; + chip = mtd->priv; + txx9_priv = chip->priv; + +#ifdef CONFIG_MTD_PARTITIONS + del_mtd_partitions(mtd); + kfree(drvdata->parts[i]); +#endif + del_mtd_device(mtd); + kfree(txx9_priv); + } + return 0; +} + +#ifdef CONFIG_PM +static int txx9ndfmc_resume(struct platform_device *dev) +{ + if (platform_get_drvdata(dev)) + txx9ndfmc_initialize(dev); + return 0; +} +#else +#define txx9ndfmc_resume NULL +#endif + +static struct platform_driver txx9ndfmc_driver = { + .remove = __exit_p(txx9ndfmc_remove), + .resume = txx9ndfmc_resume, + .driver = { + .name = "txx9ndfmc", + .owner = THIS_MODULE, + }, +}; + +static int __init txx9ndfmc_init(void) +{ + return platform_driver_probe(&txx9ndfmc_driver, txx9ndfmc_probe); +} + +static void __exit txx9ndfmc_exit(void) +{ + platform_driver_unregister(&txx9ndfmc_driver); +} + +module_init(txx9ndfmc_init); +module_exit(txx9ndfmc_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TXx9 SoC NAND flash controller driver"); +MODULE_ALIAS("platform:txx9ndfmc"); -- cgit v1.2.3-59-g8ed1b From 610f75e74b636f933bc3e379a88a10f883b91332 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Wed, 4 Mar 2009 12:01:34 -0800 Subject: [MTD] RBTX4939 map driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a map driver for NOR flash chips on RBTX4939 board. Signed-off-by: Atsushi Nemoto Cc: Ralf Bächle Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- drivers/mtd/maps/Kconfig | 6 ++ drivers/mtd/maps/Makefile | 1 + drivers/mtd/maps/rbtx4939-flash.c | 208 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 215 insertions(+) create mode 100644 drivers/mtd/maps/rbtx4939-flash.c diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index 043d50fb6ef6..a0da8e243cc4 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -542,6 +542,12 @@ config MTD_INTEL_VR_NOR Map driver for a NOR flash bank located on the Expansion Bus of the Intel Vermilion Range chipset. +config MTD_RBTX4939 + tristate "Map driver for RBTX4939 board" + depends on TOSHIBA_RBTX4939 && MTD_CFI && MTD_COMPLEX_MAPPINGS + help + Map driver for NOR flash chips on RBTX4939 board. + config MTD_PLATRAM tristate "Map driver for platform device RAM (mtd-ram)" select MTD_RAM diff --git a/drivers/mtd/maps/Makefile b/drivers/mtd/maps/Makefile index 6d9ba35caf11..f53a7dc4a4fb 100644 --- a/drivers/mtd/maps/Makefile +++ b/drivers/mtd/maps/Makefile @@ -61,3 +61,4 @@ obj-$(CONFIG_MTD_PLATRAM) += plat-ram.o obj-$(CONFIG_MTD_OMAP_NOR) += omap_nor.o obj-$(CONFIG_MTD_INTEL_VR_NOR) += intel_vr_nor.o obj-$(CONFIG_MTD_BFIN_ASYNC) += bfin-async-flash.o +obj-$(CONFIG_MTD_RBTX4939) += rbtx4939-flash.o diff --git a/drivers/mtd/maps/rbtx4939-flash.c b/drivers/mtd/maps/rbtx4939-flash.c new file mode 100644 index 000000000000..d39f0adac846 --- /dev/null +++ b/drivers/mtd/maps/rbtx4939-flash.c @@ -0,0 +1,208 @@ +/* + * rbtx4939-flash (based on physmap.c) + * + * This is a simplified physmap driver with map_init callback function. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Copyright (C) 2009 Atsushi Nemoto + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct rbtx4939_flash_info { + struct mtd_info *mtd; + struct map_info map; +#ifdef CONFIG_MTD_PARTITIONS + int nr_parts; + struct mtd_partition *parts; +#endif +}; + +static int rbtx4939_flash_remove(struct platform_device *dev) +{ + struct rbtx4939_flash_info *info; + + info = platform_get_drvdata(dev); + if (!info) + return 0; + platform_set_drvdata(dev, NULL); + + if (info->mtd) { +#ifdef CONFIG_MTD_PARTITIONS + struct rbtx4939_flash_data *pdata = dev->dev.platform_data; + + if (info->nr_parts) { + del_mtd_partitions(info->mtd); + kfree(info->parts); + } else if (pdata->nr_parts) + del_mtd_partitions(info->mtd); + else + del_mtd_device(info->mtd); +#else + del_mtd_device(info->mtd); +#endif + map_destroy(info->mtd); + } + return 0; +} + +static const char *rom_probe_types[] = { "cfi_probe", "jedec_probe", NULL }; +#ifdef CONFIG_MTD_PARTITIONS +static const char *part_probe_types[] = { "cmdlinepart", NULL }; +#endif + +static int rbtx4939_flash_probe(struct platform_device *dev) +{ + struct rbtx4939_flash_data *pdata; + struct rbtx4939_flash_info *info; + struct resource *res; + const char **probe_type; + int err = 0; + unsigned long size; + + pdata = dev->dev.platform_data; + if (!pdata) + return -ENODEV; + + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!res) + return -ENODEV; + info = devm_kzalloc(&dev->dev, sizeof(struct rbtx4939_flash_info), + GFP_KERNEL); + if (!info) + return -ENOMEM; + + platform_set_drvdata(dev, info); + + size = resource_size(res); + pr_notice("rbtx4939 platform flash device: %pR\n", res); + + if (!devm_request_mem_region(&dev->dev, res->start, size, + dev_name(&dev->dev))) + return -EBUSY; + + info->map.name = dev_name(&dev->dev); + info->map.phys = res->start; + info->map.size = size; + info->map.bankwidth = pdata->width; + + info->map.virt = devm_ioremap(&dev->dev, info->map.phys, size); + if (!info->map.virt) + return -EBUSY; + + if (pdata->map_init) + (*pdata->map_init)(&info->map); + else + simple_map_init(&info->map); + + probe_type = rom_probe_types; + for (; !info->mtd && *probe_type; probe_type++) + info->mtd = do_map_probe(*probe_type, &info->map); + if (!info->mtd) { + dev_err(&dev->dev, "map_probe failed\n"); + err = -ENXIO; + goto err_out; + } + info->mtd->owner = THIS_MODULE; + if (err) + goto err_out; + +#ifdef CONFIG_MTD_PARTITIONS + err = parse_mtd_partitions(info->mtd, part_probe_types, + &info->parts, 0); + if (err > 0) { + add_mtd_partitions(info->mtd, info->parts, err); + info->nr_parts = err; + return 0; + } + + if (pdata->nr_parts) { + pr_notice("Using rbtx4939 partition information\n"); + add_mtd_partitions(info->mtd, pdata->parts, pdata->nr_parts); + return 0; + } +#endif + + add_mtd_device(info->mtd); + return 0; + +err_out: + rbtx4939_flash_remove(dev); + return err; +} + +#ifdef CONFIG_PM +static int rbtx4939_flash_suspend(struct platform_device *dev, + pm_message_t state) +{ + struct rbtx4939_flash_info *info = platform_get_drvdata(dev); + + if (info->mtd->suspend) + return info->mtd->suspend(info->mtd); + return 0; +} + +static int rbtx4939_flash_resume(struct platform_device *dev) +{ + struct rbtx4939_flash_info *info = platform_get_drvdata(dev); + + if (info->mtd->resume) + info->mtd->resume(info->mtd); + return 0; +} + +static void rbtx4939_flash_shutdown(struct platform_device *dev) +{ + struct rbtx4939_flash_info *info = platform_get_drvdata(dev); + + if (info->mtd->suspend && info->mtd->resume) + if (info->mtd->suspend(info->mtd) == 0) + info->mtd->resume(info->mtd); +} +#else +#define rbtx4939_flash_suspend NULL +#define rbtx4939_flash_resume NULL +#define rbtx4939_flash_shutdown NULL +#endif + +static struct platform_driver rbtx4939_flash_driver = { + .probe = rbtx4939_flash_probe, + .remove = rbtx4939_flash_remove, + .suspend = rbtx4939_flash_suspend, + .resume = rbtx4939_flash_resume, + .shutdown = rbtx4939_flash_shutdown, + .driver = { + .name = "rbtx4939-flash", + .owner = THIS_MODULE, + }, +}; + +static int __init rbtx4939_flash_init(void) +{ + return platform_driver_register(&rbtx4939_flash_driver); +} + +static void __exit rbtx4939_flash_exit(void) +{ + platform_driver_unregister(&rbtx4939_flash_driver); +} + +module_init(rbtx4939_flash_init); +module_exit(rbtx4939_flash_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RBTX4939 MTD map driver"); +MODULE_ALIAS("platform:rbtx4939-flash"); -- cgit v1.2.3-59-g8ed1b From 6ac15e92dfd3064ae484e2e823890622476c4356 Mon Sep 17 00:00:00 2001 From: Graff Yang Date: Mon, 2 Mar 2009 16:31:29 +0800 Subject: [MTD] [CHIPS] cfi_cmdset_0001.c: Fix a bug in inval_cache_and_wait_for_operation(). If the inval_cache_and_wait_for_operation() is re-entered by write operation when erase operation is in progress, the chip->erase_suspended will be cleared, this cause the erase timeo is not reset and will result time out error for erase. Signed-off-by: Graff Yang Signed-off-by: David Woodhouse --- drivers/mtd/chips/cfi_cmdset_0001.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c index f5ab6fa1057b..c240454fd113 100644 --- a/drivers/mtd/chips/cfi_cmdset_0001.c +++ b/drivers/mtd/chips/cfi_cmdset_0001.c @@ -1236,10 +1236,14 @@ static int inval_cache_and_wait_for_operation( remove_wait_queue(&chip->wq, &wait); spin_lock(chip->mutex); } - if (chip->erase_suspended || chip->write_suspended) { - /* Suspend has occured while sleep: reset timeout */ + if (chip->erase_suspended && chip_state == FL_ERASING) { + /* Erase suspend occured while sleep: reset timeout */ timeo = reset_timeo; chip->erase_suspended = 0; + } + if (chip->write_suspended && chip_state == FL_WRITING) { + /* Write suspend occured while sleep: reset timeout */ + timeo = reset_timeo; chip->write_suspended = 0; } } -- cgit v1.2.3-59-g8ed1b From 90160e13b0bfe82d8bfe83541e3aa3961fdeed25 Mon Sep 17 00:00:00 2001 From: Scott James Remnant Date: Mon, 2 Mar 2009 18:42:39 +0000 Subject: [MTD] Auto-load mtdchar module when device opened. The mtdchar module is missing the char-major-90-* alias that would cause it to be auto-loaded when a device of that type is opened. This patch adds the alia.. Signed-off-by: Scott James Remnant Signed-off-by: Tim Gardner Signed-off-by: David Woodhouse --- drivers/mtd/mtdchar.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index e9ec59e9a566..2c8a16f49ca2 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -825,3 +825,4 @@ module_exit(cleanup_mtdchar); MODULE_LICENSE("GPL"); MODULE_AUTHOR("David Woodhouse "); MODULE_DESCRIPTION("Direct character-device access to MTD devices"); +MODULE_ALIAS_CHARDEV_MAJOR(MTD_CHAR_MAJOR); -- cgit v1.2.3-59-g8ed1b From e7f521636a2d6b1a012f98f6ec16898c5d6f1543 Mon Sep 17 00:00:00 2001 From: Scott James Remnant Date: Mon, 2 Mar 2009 17:43:04 +0000 Subject: [MTD] Auto-load nftl module when device opened. The nftl module is missing the block-major-93-* alias that would cause it to be auto-loaded when a nftl of that type is opened. This patch adds the alias. Signed-off-by: Scott James Remnant Signed-off-by: Tim Gardner Signed-off-by: David Woodhouse --- drivers/mtd/nftlcore.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index f5bcd4969462..e3f8495a94c2 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -817,3 +818,4 @@ module_exit(cleanup_nftl); MODULE_LICENSE("GPL"); MODULE_AUTHOR("David Woodhouse , Fabrice Bellard et al."); MODULE_DESCRIPTION("Support code for NAND Flash Translation Layer, used on M-Systems DiskOnChip 2000 and Millennium"); +MODULE_ALIAS_BLOCKDEV_MAJOR(NFTL_MAJOR); -- cgit v1.2.3-59-g8ed1b From b2ed3680553b451e5c45064de26ea8fa5201c6d4 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 17 Feb 2009 13:54:45 +0200 Subject: [MTD] [NAND] pxa3xx_nand: use resource_size instead of 'r->end - r->start + 1' Signed-off-by: Mike Rapoport Signed-off-by: David Woodhouse --- drivers/mtd/nand/pxa3xx_nand.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index cc55cbc2b308..ffa960baa7e7 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -1118,14 +1118,14 @@ static int pxa3xx_nand_probe(struct platform_device *pdev) goto fail_put_clk; } - r = request_mem_region(r->start, r->end - r->start + 1, pdev->name); + r = request_mem_region(r->start, resource_size(r), pdev->name); if (r == NULL) { dev_err(&pdev->dev, "failed to request memory resource\n"); ret = -EBUSY; goto fail_put_clk; } - info->mmio_base = ioremap(r->start, r->end - r->start + 1); + info->mmio_base = ioremap(r->start, resource_size(r)); if (info->mmio_base == NULL) { dev_err(&pdev->dev, "ioremap() failed\n"); ret = -ENODEV; @@ -1174,7 +1174,7 @@ fail_free_buf: fail_free_io: iounmap(info->mmio_base); fail_free_res: - release_mem_region(r->start, r->end - r->start + 1); + release_mem_region(r->start, resource_size(r)); fail_put_clk: clk_disable(info->clk); clk_put(info->clk); -- cgit v1.2.3-59-g8ed1b From 82a72d108b4fbcc8f651b7c4e34c6f18a605d58d Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 17 Feb 2009 13:54:46 +0200 Subject: [MTD] [NAND] pxa3xx_nand: allow building as module Signed-off-by: Mike Rapoport Signed-off-by: David Woodhouse --- drivers/mtd/nand/Kconfig | 2 +- drivers/mtd/nand/pxa3xx_nand.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 68ae144ce3be..4e7073954e53 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -334,7 +334,7 @@ config MTD_NAND_ATMEL_ECC_NONE endchoice config MTD_NAND_PXA3xx - bool "Support for NAND flash devices on PXA3xx" + tristate "Support for NAND flash devices on PXA3xx" depends on MTD_NAND && PXA3xx help This enables the driver for the NAND flash device found on diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index ffa960baa7e7..ead4a7a72d44 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -1079,6 +1079,7 @@ static int pxa3xx_nand_probe(struct platform_device *pdev) this = &info->nand_chip; mtd->priv = info; + mtd->owner = THIS_MODULE; info->clk = clk_get(&pdev->dev, NULL); if (IS_ERR(info->clk)) { @@ -1187,6 +1188,7 @@ static int pxa3xx_nand_remove(struct platform_device *pdev) { struct mtd_info *mtd = platform_get_drvdata(pdev); struct pxa3xx_nand_info *info = mtd->priv; + struct resource *r; platform_set_drvdata(pdev, NULL); @@ -1199,6 +1201,14 @@ static int pxa3xx_nand_remove(struct platform_device *pdev) info->data_buff, info->data_buff_phys); } else kfree(info->data_buff); + + iounmap(info->mmio_base); + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + release_mem_region(r->start, resource_size(r)); + + clk_disable(info->clk); + clk_put(info->clk); + kfree(mtd); return 0; } -- cgit v1.2.3-59-g8ed1b From f271049e2010b918f83dc1c7bbd5d75f4710506a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 17 Feb 2009 13:54:47 +0200 Subject: [MTD] [NAND] pxa3xx_nand: add ability to keep controller settings defined by OBM/bootloader Signed-off-by: Mike Rapoport Acked-by: Eric Miao Signed-off-by: David Woodhouse --- arch/arm/mach-pxa/include/mach/pxa3xx_nand.h | 3 + drivers/mtd/nand/pxa3xx_nand.c | 103 ++++++++++++++++++++++++++- 2 files changed, 105 insertions(+), 1 deletion(-) diff --git a/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h b/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h index eb35fca9aea5..3478eae32d8a 100644 --- a/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h +++ b/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h @@ -49,6 +49,9 @@ struct pxa3xx_nand_platform_data { */ int enable_arbiter; + /* allow platform code to keep OBM/bootloader defined NFC config */ + int keep_config; + const struct mtd_partition *parts; unsigned int nr_parts; diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index ead4a7a72d44..2857a6a37b5c 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -171,7 +171,13 @@ static int use_dma = 1; module_param(use_dma, bool, 0444); MODULE_PARM_DESC(use_dma, "enable DMA for data transfering to/from NAND HW"); -#ifdef CONFIG_MTD_NAND_PXA3xx_BUILTIN +/* + * Default NAND flash controller configuration setup by the + * bootloader. This configuration is used only when pdata->keep_config is set + */ +static struct pxa3xx_nand_timing default_timing; +static struct pxa3xx_nand_flash default_flash; + static struct pxa3xx_nand_cmdset smallpage_cmdset = { .read1 = 0x0000, .read2 = 0x0050, @@ -198,6 +204,7 @@ static struct pxa3xx_nand_cmdset largepage_cmdset = { .lock_status = 0x007A, }; +#ifdef CONFIG_MTD_NAND_PXA3xx_BUILTIN static struct pxa3xx_nand_timing samsung512MbX16_timing = { .tCH = 10, .tCS = 0, @@ -297,9 +304,23 @@ static struct pxa3xx_nand_flash *builtin_flash_types[] = { #define NDTR1_tWHR(c) (min((c), 15) << 4) #define NDTR1_tAR(c) (min((c), 15) << 0) +#define tCH_NDTR0(r) (((r) >> 19) & 0x7) +#define tCS_NDTR0(r) (((r) >> 16) & 0x7) +#define tWH_NDTR0(r) (((r) >> 11) & 0x7) +#define tWP_NDTR0(r) (((r) >> 8) & 0x7) +#define tRH_NDTR0(r) (((r) >> 3) & 0x7) +#define tRP_NDTR0(r) (((r) >> 0) & 0x7) + +#define tR_NDTR1(r) (((r) >> 16) & 0xffff) +#define tWHR_NDTR1(r) (((r) >> 4) & 0xf) +#define tAR_NDTR1(r) (((r) >> 0) & 0xf) + /* convert nano-seconds to nand flash controller clock cycles */ #define ns2cycle(ns, clk) (int)(((ns) * (clk / 1000000) / 1000) - 1) +/* convert nand flash controller clock cycles to nano-seconds */ +#define cycle2ns(c, clk) ((((c) + 1) * 1000000 + clk / 500) / (clk / 1000)) + static void pxa3xx_nand_set_timing(struct pxa3xx_nand_info *info, const struct pxa3xx_nand_timing *t) { @@ -921,6 +942,82 @@ static int pxa3xx_nand_config_flash(struct pxa3xx_nand_info *info, return 0; } +static void pxa3xx_nand_detect_timing(struct pxa3xx_nand_info *info, + struct pxa3xx_nand_timing *t) +{ + unsigned long nand_clk = clk_get_rate(info->clk); + uint32_t ndtr0 = nand_readl(info, NDTR0CS0); + uint32_t ndtr1 = nand_readl(info, NDTR1CS0); + + t->tCH = cycle2ns(tCH_NDTR0(ndtr0), nand_clk); + t->tCS = cycle2ns(tCS_NDTR0(ndtr0), nand_clk); + t->tWH = cycle2ns(tWH_NDTR0(ndtr0), nand_clk); + t->tWP = cycle2ns(tWP_NDTR0(ndtr0), nand_clk); + t->tRH = cycle2ns(tRH_NDTR0(ndtr0), nand_clk); + t->tRP = cycle2ns(tRP_NDTR0(ndtr0), nand_clk); + + t->tR = cycle2ns(tR_NDTR1(ndtr1), nand_clk); + t->tWHR = cycle2ns(tWHR_NDTR1(ndtr1), nand_clk); + t->tAR = cycle2ns(tAR_NDTR1(ndtr1), nand_clk); +} + +static int pxa3xx_nand_detect_config(struct pxa3xx_nand_info *info) +{ + uint32_t ndcr = nand_readl(info, NDCR); + struct nand_flash_dev *type = NULL; + uint32_t id = -1; + int i; + + default_flash.page_per_block = ndcr & NDCR_PG_PER_BLK ? 64 : 32; + default_flash.page_size = ndcr & NDCR_PAGE_SZ ? 2048 : 512; + default_flash.flash_width = ndcr & NDCR_DWIDTH_M ? 16 : 8; + default_flash.dfc_width = ndcr & NDCR_DWIDTH_C ? 16 : 8; + + if (default_flash.page_size == 2048) + default_flash.cmdset = &largepage_cmdset; + else + default_flash.cmdset = &smallpage_cmdset; + + /* set info fields needed to __readid */ + info->flash_info = &default_flash; + info->read_id_bytes = (default_flash.page_size == 2048) ? 4 : 2; + info->reg_ndcr = ndcr; + + if (__readid(info, &id)) + return -ENODEV; + + /* Lookup the flash id */ + id = (id >> 8) & 0xff; /* device id is byte 2 */ + for (i = 0; nand_flash_ids[i].name != NULL; i++) { + if (id == nand_flash_ids[i].id) { + type = &nand_flash_ids[i]; + break; + } + } + + if (!type) + return -ENODEV; + + /* fill the missing flash information */ + i = __ffs(default_flash.page_per_block * default_flash.page_size); + default_flash.num_blocks = type->chipsize << (20 - i); + + info->oob_size = (default_flash.page_size == 2048) ? 64 : 16; + + /* calculate addressing information */ + info->col_addr_cycles = (default_flash.page_size == 2048) ? 2 : 1; + + if (default_flash.num_blocks * default_flash.page_per_block > 65536) + info->row_addr_cycles = 3; + else + info->row_addr_cycles = 2; + + pxa3xx_nand_detect_timing(info, &default_timing); + default_flash.timing = &default_timing; + + return 0; +} + static int pxa3xx_nand_detect_flash(struct pxa3xx_nand_info *info, const struct pxa3xx_nand_platform_data *pdata) { @@ -928,6 +1025,10 @@ static int pxa3xx_nand_detect_flash(struct pxa3xx_nand_info *info, uint32_t id = -1; int i; + if (pdata->keep_config) + if (pxa3xx_nand_detect_config(info) == 0) + return 0; + for (i = 0; inum_flash; ++i) { f = pdata->flash + i; -- cgit v1.2.3-59-g8ed1b From e2a0f25b4f520adbd82c0caafcde0470ed11053d Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 16 Feb 2009 18:21:35 +0200 Subject: [MTD] mtdoops: allow MTD selection by name MTD's have both an index number and a name. Formerly, the MTD selected for mtdoops was done only by index number. With this patch, a name can be used instead. For example, the kernel command line: console=ttyMTD5 selects MTD 5 for mtdoops. But now this is also possible: console=ttyMTD,log which selects the MTD named "log" for mtdoops. This has the advantage that partitions can be added or removed that would affect the MTD index number but not the name, without having to then change the kernel command line. Signed-off-by: Adrian Hunter Acked-by: Richard Purdie Acked-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/mtdoops.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index 1a6b3beabe8d..fdf504fb319a 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -44,6 +44,7 @@ static struct mtdoops_context { int oops_pages; int nextpage; int nextcount; + char *name; void *oops_buf; @@ -273,6 +274,9 @@ static void mtdoops_notify_add(struct mtd_info *mtd) { struct mtdoops_context *cxt = &oops_cxt; + if (cxt->name && !strcmp(mtd->name, cxt->name)) + cxt->mtd_index = mtd->index; + if ((mtd->index != cxt->mtd_index) || cxt->mtd_index < 0) return; @@ -383,8 +387,12 @@ static int __init mtdoops_console_setup(struct console *co, char *options) { struct mtdoops_context *cxt = co->data; - if (cxt->mtd_index != -1) + if (cxt->mtd_index != -1 || cxt->name) return -EBUSY; + if (options) { + cxt->name = kstrdup(options, GFP_KERNEL); + return 0; + } if (co->index == -1) return -EINVAL; @@ -432,6 +440,7 @@ static void __exit mtdoops_console_exit(void) unregister_mtd_user(&mtdoops_notifier); unregister_console(&mtdoops_console); + kfree(cxt->name); vfree(cxt->oops_buf); } -- cgit v1.2.3-59-g8ed1b From 48ec00ac895074f8a47bda8f3925ccaa46abb7a8 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 4 Mar 2009 09:53:40 +0200 Subject: [MTD] mtdoops: fix a bit of spin lock usage - do not leave spin lock locked - initialise spin lock Signed-off-by: Adrian Hunter Acked-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- drivers/mtd/mtdoops.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index fdf504fb319a..1060337c06df 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -361,8 +361,10 @@ mtdoops_console_write(struct console *co, const char *s, unsigned int count) spin_lock_irqsave(&cxt->writecount_lock, flags); /* Check ready status didn't change whilst waiting for the lock */ - if (!cxt->ready) + if (!cxt->ready) { + spin_unlock_irqrestore(&cxt->writecount_lock, flags); return; + } if (cxt->writecount == 0) { u32 *stamp = cxt->oops_buf; @@ -420,6 +422,7 @@ static int __init mtdoops_console_init(void) cxt->mtd_index = -1; cxt->oops_buf = vmalloc(OOPS_PAGE_SIZE); + spin_lock_init(&cxt->writecount_lock); if (!cxt->oops_buf) { printk(KERN_ERR "Failed to allocate mtdoops buffer workspace\n"); -- cgit v1.2.3-59-g8ed1b From bd50a0ffca0bbb9baa60ab1ef2c09cf7561e1223 Mon Sep 17 00:00:00 2001 From: Yegor Yefremov Date: Fri, 20 Mar 2009 18:50:26 +0000 Subject: [MTD] [CHIPS] Add JEDEC probe support for the SST 39VF3201 flash chip Signed-off-by: Andrew Morton Signed-off-by: Yegor Yefremov Signed-off-by: David Woodhouse --- drivers/mtd/chips/jedec_probe.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/mtd/chips/jedec_probe.c b/drivers/mtd/chips/jedec_probe.c index 2f3f2f719ba4..e824b9b9b056 100644 --- a/drivers/mtd/chips/jedec_probe.c +++ b/drivers/mtd/chips/jedec_probe.c @@ -159,6 +159,7 @@ #define SST39LF800 0x2781 #define SST39LF160 0x2782 #define SST39VF1601 0x234b +#define SST39VF3201 0x235b #define SST39LF512 0x00D4 #define SST39LF010 0x00D5 #define SST39LF020 0x00D6 @@ -1489,6 +1490,21 @@ static const struct amd_flash_info jedec_table[] = { ERASEINFO(0x1000,256), ERASEINFO(0x1000,256) } + }, { + .mfr_id = MANUFACTURER_SST, /* should be CFI */ + .dev_id = SST39VF3201, + .name = "SST 39VF3201", + .devtypes = CFI_DEVICETYPE_X16, + .uaddr = MTD_UADDR_0xAAAA_0x5555, + .dev_size = SIZE_4MiB, + .cmd_set = P_ID_AMD_STD, + .nr_regions = 4, + .regions = { + ERASEINFO(0x1000,256), + ERASEINFO(0x1000,256), + ERASEINFO(0x1000,256), + ERASEINFO(0x1000,256) + } }, { .mfr_id = MANUFACTURER_SST, .dev_id = SST36VF3203, -- cgit v1.2.3-59-g8ed1b From 17b536cc43bcf2afcc20b4812f93a895881b5f4f Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Fri, 6 Mar 2009 20:01:08 +0900 Subject: [MTD] mtdpart: Make all partition parsers return allocated array Currently redboot and afx parser return allocated mtd_partition array and cmdlinepart and ar7 return persistent array. This patch make cmdlinepart and ar7 also return allocated array, so that all users can free it regardless of parser type. Signed-off-by: Atsushi Nemoto Signed-off-by: David Woodhouse --- drivers/mtd/ar7part.c | 6 ++++-- drivers/mtd/cmdlinepart.c | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/ar7part.c b/drivers/mtd/ar7part.c index ecf170b55c32..6697a1ec72d0 100644 --- a/drivers/mtd/ar7part.c +++ b/drivers/mtd/ar7part.c @@ -44,8 +44,6 @@ struct ar7_bin_rec { unsigned int address; }; -static struct mtd_partition ar7_parts[AR7_PARTS]; - static int create_mtd_partitions(struct mtd_info *master, struct mtd_partition **pparts, unsigned long origin) @@ -57,7 +55,11 @@ static int create_mtd_partitions(struct mtd_info *master, unsigned int root_offset = ROOT_OFFSET; int retries = 10; + struct mtd_partition *ar7_parts; + ar7_parts = kzalloc(sizeof(*ar7_parts) * AR7_PARTS, GFP_KERNEL); + if (!ar7_parts) + return -ENOMEM; ar7_parts[0].name = "loader"; ar7_parts[0].offset = 0; ar7_parts[0].size = master->erasesize; diff --git a/drivers/mtd/cmdlinepart.c b/drivers/mtd/cmdlinepart.c index 50a340388e74..5011fa73f918 100644 --- a/drivers/mtd/cmdlinepart.c +++ b/drivers/mtd/cmdlinepart.c @@ -335,7 +335,11 @@ static int parse_cmdline_partitions(struct mtd_info *master, } offset += part->parts[i].size; } - *pparts = part->parts; + *pparts = kmemdup(part->parts, + sizeof(*part->parts) * part->num_parts, + GFP_KERNEL); + if (!*pparts) + return -ENOMEM; return part->num_parts; } } -- cgit v1.2.3-59-g8ed1b From 09c9e84d474d917d9de5b9011ed2064b03a19677 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 21 Mar 2009 04:33:36 +0100 Subject: tracing/ring-buffer: don't annotate rb_cpu_notify with __cpuinit Impact: remove a section warning CONFIG_DEBUG_SECTION_MISMATCH raises the following warning on -tip: WARNING: kernel/trace/built-in.o(.text+0x5bc5): Section mismatch in reference from the function ring_buffer_alloc() to the function .cpuinit.text:rb_cpu_notify() The function ring_buffer_alloc() references the function __cpuinit rb_cpu_notify(). This is actually harmless. The code in the ring buffer don't build rb_cpu_notify and other cpu hotplug stuffs when !CONFIG_HOTPLUG_CPU so we have no risk to reference freed memory here (it would even be harmless if we unconditionally build it because register_cpu_notifier would do nothing when !CONFIG_HOTPLUG_CPU. But since ring_buffer_alloc() can be called everytime, we don't want it to be annotated with __cpuinit so we drop the __cpuinit from rb_cpu_notify. This is not a waste of memory because it is only defined and used on CONFIG_HOTPLUG_CPU. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237606416-22268-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 384ca5d9d729..808b14bbf076 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -535,8 +535,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) extern int ring_buffer_page_too_big(void); #ifdef CONFIG_HOTPLUG_CPU -static int __cpuinit rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); #endif /** @@ -2784,8 +2784,8 @@ static __init int rb_init_debugfs(void) fs_initcall(rb_init_debugfs); #ifdef CONFIG_HOTPLUG_CPU -static int __cpuinit rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { struct ring_buffer *buffer = container_of(self, struct ring_buffer, cpu_notify); -- cgit v1.2.3-59-g8ed1b From 1a17662ea033674a58bad3603531b0b5d42572f6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:47:30 +0800 Subject: blktrace: fix possible memory leak When we failed to create "block" debugfs dir, we should do some cleanups. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F5B2.8000800@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b171778e3863..fb3bc53835dd 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -432,7 +432,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!blk_tree_root) { blk_tree_root = debugfs_create_dir("block", NULL); if (!blk_tree_root) - return -ENOMEM; + goto err; } dir = debugfs_create_dir(buts->name, blk_tree_root); -- cgit v1.2.3-59-g8ed1b From 5006ea73f38caef6065d1136808413813271633f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:48:03 +0800 Subject: blktrace: make blk_tracer_enabled a bool flag It doesn't have to be a counter, and it can be a bool flag instead. Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <49C2F5D3.8090104@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb3bc53835dd..73845b7968bb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -30,7 +30,7 @@ static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; -static int __read_mostly blk_tracer_enabled; +static bool blk_tracer_enabled __read_mostly; /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -1111,9 +1111,7 @@ static int blk_tracer_init(struct trace_array *tr) { blk_tr = tr; blk_tracer_start(tr); - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled++; - mutex_unlock(&blk_probe_mutex); + blk_tracer_enabled = true; return 0; } @@ -1131,11 +1129,7 @@ static void blk_tracer_reset(struct trace_array *tr) if (!atomic_read(&blk_probes_ref)) return; - mutex_lock(&blk_probe_mutex); - blk_tracer_enabled--; - WARN_ON(blk_tracer_enabled < 0); - mutex_unlock(&blk_probe_mutex); - + blk_tracer_enabled = false; blk_tracer_stop(tr); } -- cgit v1.2.3-59-g8ed1b From 3c289ba7c320560ee74979a8895141c829046a2d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:48:26 +0800 Subject: blktrace: remove blk_probe_mutex blk_register_tracepoints() always returns 0, so make it return void, thus we don't need to use blk_probe_mutex to protect blk_probes_ref. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F5EA.8060606@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 73845b7968bb..223b92e77b3f 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -47,10 +47,9 @@ static struct tracer_flags blk_tracer_flags = { }; /* Global reference count of probes */ -static DEFINE_MUTEX(blk_probe_mutex); static atomic_t blk_probes_ref = ATOMIC_INIT(0); -static int blk_register_tracepoints(void); +static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); /* @@ -256,10 +255,8 @@ static void blk_trace_cleanup(struct blk_trace *bt) free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); - mutex_lock(&blk_probe_mutex); if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); } int blk_trace_remove(struct request_queue *q) @@ -471,13 +468,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) { - ret = blk_register_tracepoints(); - if (ret) - goto probe_err; - } - mutex_unlock(&blk_probe_mutex); + if (atomic_add_return(1, &blk_probes_ref) == 1) + blk_register_tracepoints(); ret = -EBUSY; old_bt = xchg(&q->blk_trace, bt); @@ -487,9 +479,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } return 0; -probe_err: - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); err: if (bt) { if (bt->msg_file) @@ -863,7 +852,7 @@ void blk_add_driver_data(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_add_driver_data); -static int blk_register_tracepoints(void) +static void blk_register_tracepoints(void) { int ret; @@ -901,7 +890,6 @@ static int blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); - return 0; } static void blk_unregister_tracepoints(void) @@ -1099,11 +1087,8 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { - mutex_lock(&blk_probe_mutex); if (atomic_add_return(1, &blk_probes_ref) == 1) - if (blk_register_tracepoints()) - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); + blk_register_tracepoints(); trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } @@ -1118,10 +1103,8 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { trace_flags |= TRACE_ITER_CONTEXT_INFO; - mutex_lock(&blk_probe_mutex); if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); } static void blk_tracer_reset(struct trace_array *tr) -- cgit v1.2.3-59-g8ed1b From cbe28296eb1ac441b35cf45804d0ae808add7dd1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:48:47 +0800 Subject: blktrace: don't increase blk_probes_ref if failed to setup blk trace do_blk_trace_setup() may return EBUSY, but the current code doesn't decrease blk_probes_ref in this case. Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F5FF.80002@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 223b92e77b3f..11e7c8d9d222 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -468,9 +468,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->pid = buts->pid; bt->trace_state = Blktrace_setup; - if (atomic_add_return(1, &blk_probes_ref) == 1) - blk_register_tracepoints(); - ret = -EBUSY; old_bt = xchg(&q->blk_trace, bt); if (old_bt) { @@ -478,6 +475,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, goto err; } + if (atomic_add_return(1, &blk_probes_ref) == 1) + blk_register_tracepoints(); + return 0; err: if (bt) { -- cgit v1.2.3-59-g8ed1b From 15152e448b693fa41de40f1e40ffbe717a3aab88 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 09:49:08 +0800 Subject: blktrace: report EBUSY correctly blk_trace_remove_queue() returns EINVAL if q->blk_trace == NULL, but blk_trace_setup_queue() doesn't return EBUSY if q->blk_trace != NULL. # echo 0 > sdaX/trace/enable # echo 0 > sdaX/trace/enable bash: echo: write error: Invalid argument # echo 1 > sdaX/trace/enable # echo 1 > sdaX/trace/enable (should return EBUSY) Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Acked-by: Frederic Weisbecker Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C2F614.2010101@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 11e7c8d9d222..14986afdbc1c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1260,12 +1260,10 @@ static int blk_trace_remove_queue(struct request_queue *q) static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) { struct blk_trace *old_bt, *bt = NULL; - int ret; - ret = -ENOMEM; bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - goto err; + return -ENOMEM; bt->dev = dev; bt->act_mask = (u16)-1; @@ -1276,11 +1274,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) if (old_bt != NULL) { (void)xchg(&q->blk_trace, old_bt); kfree(bt); - ret = -EBUSY; + return -EBUSY; } + return 0; -err: - return ret; } /* -- cgit v1.2.3-59-g8ed1b From cd649b8bb830d65c57c3c8b98d57b5402256d8bd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 11:33:55 +0800 Subject: blktrace: remove sysfs_blk_trace_enable_show/store() sysfs_blk_trace_enable_show()/store() share most of code with sysfs_blk_trace_attr_show()/store(). Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C30EA3.1060004@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 92 ++++++++++++------------------------------------- 1 file changed, 22 insertions(+), 70 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 14986afdbc1c..dfee6f915179 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1284,72 +1284,6 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) * sysfs interface to enable and configure tracing */ -static ssize_t sysfs_blk_trace_enable_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct hd_struct *p = dev_to_part(dev); - struct block_device *bdev; - ssize_t ret = -ENXIO; - - lock_kernel(); - bdev = bdget(part_devt(p)); - if (bdev != NULL) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q != NULL) { - mutex_lock(&bdev->bd_mutex); - ret = sprintf(buf, "%u\n", !!q->blk_trace); - mutex_unlock(&bdev->bd_mutex); - } - - bdput(bdev); - } - - unlock_kernel(); - return ret; -} - -static ssize_t sysfs_blk_trace_enable_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct block_device *bdev; - struct request_queue *q; - struct hd_struct *p; - int value; - ssize_t ret = -ENXIO; - - if (count == 0 || sscanf(buf, "%d", &value) != 1) - goto out; - - lock_kernel(); - p = dev_to_part(dev); - bdev = bdget(part_devt(p)); - if (bdev == NULL) - goto out_unlock_kernel; - - q = bdev_get_queue(bdev); - if (q == NULL) - goto out_bdput; - - mutex_lock(&bdev->bd_mutex); - if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); - else - ret = blk_trace_remove_queue(q); - mutex_unlock(&bdev->bd_mutex); - - if (ret == 0) - ret = count; -out_bdput: - bdput(bdev); -out_unlock_kernel: - unlock_kernel(); -out: - return ret; -} - static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -1361,8 +1295,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, sysfs_blk_trace_attr_show, \ sysfs_blk_trace_attr_store) -static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR, - sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store); +static BLK_TRACE_DEVICE_ATTR(enable); static BLK_TRACE_DEVICE_ATTR(act_mask); static BLK_TRACE_DEVICE_ATTR(pid); static BLK_TRACE_DEVICE_ATTR(start_lba); @@ -1447,6 +1380,12 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + ret = sprintf(buf, "%u\n", !!q->blk_trace); + goto out_unlock_bdev; + } + if (q->blk_trace == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) @@ -1457,6 +1396,8 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); else if (attr == &dev_attr_end_lba) ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + +out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); @@ -1499,6 +1440,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out_bdput; mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + if (value) + ret = blk_trace_setup_queue(q, bdev->bd_dev); + else + ret = blk_trace_remove_queue(q); + goto out_unlock_bdev; + } + ret = 0; if (q->blk_trace == NULL) ret = blk_trace_setup_queue(q, bdev->bd_dev); @@ -1512,13 +1462,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, q->blk_trace->start_lba = value; else if (attr == &dev_attr_end_lba) q->blk_trace->end_lba = value; - ret = count; } + +out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); out_bdput: bdput(bdev); out_unlock_kernel: unlock_kernel(); out: - return ret; + return ret ? ret : count; } + -- cgit v1.2.3-59-g8ed1b From b125130b22d67f249beba10b71a254558b5279d0 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Mar 2009 10:34:00 +0800 Subject: blktrace: avoid accessing NULL bdev->bd_disk bdev->bd_disk can be NULL, if the block device is not opened. Try this against an unmounted partition, and you'll see NULL dereference: # echo 1 > /sys/block/sda/sda5/enable Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt LKML-Reference: <49C30098.6080107@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index dfee6f915179..108f4f7715a5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1362,6 +1362,14 @@ static int blk_str2act_mask(const char *str) return mask; } +static struct request_queue *blk_trace_get_queue(struct block_device *bdev) +{ + if (bdev->bd_disk == NULL) + return NULL; + + return bdev_get_queue(bdev); +} + static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1376,9 +1384,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (bdev == NULL) goto out_unlock_kernel; - q = bdev_get_queue(bdev); + q = blk_trace_get_queue(bdev); if (q == NULL) goto out_bdput; + mutex_lock(&bdev->bd_mutex); if (attr == &dev_attr_enable) { @@ -1435,7 +1444,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (bdev == NULL) goto out_unlock_kernel; - q = bdev_get_queue(bdev); + q = blk_trace_get_queue(bdev); if (q == NULL) goto out_bdput; -- cgit v1.2.3-59-g8ed1b From cf586b61f80229491127d3c57c06ed93c9f530d3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 05:04:35 +0100 Subject: tracing/function-graph-tracer: prevent hangs during self-tests Impact: detect tracing related hangs Sometimes, with some configs, the function graph tracer can make the timer interrupt too much slow, hanging the kernel in an endless loop of timer interrupts servicing. As suggested by Ingo, this patch brings a watchdog which stops the selftest after a defined number of functions traced, definitely disabling this tracer. For those who want to debug the cause of the function graph trace hang, you can pass the ftrace_dump_on_oops kernel parameter to dump the traces after this hang detection. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237694675-23509-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 26 +++++++++++++++++++++++--- kernel/trace/trace_selftest.c | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e3dfefe69348..e6fac0ffe6f0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4018,11 +4018,12 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -void ftrace_dump(void) +static void __ftrace_dump(bool disable_tracing) { static DEFINE_SPINLOCK(ftrace_dump_lock); /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + unsigned int old_userobj; static int dump_ran; unsigned long flags; int cnt = 0, cpu; @@ -4034,14 +4035,17 @@ void ftrace_dump(void) dump_ran = 1; - /* No turning back! */ tracing_off(); - ftrace_kill(); + + if (disable_tracing) + ftrace_kill(); for_each_tracing_cpu(cpu) { atomic_inc(&global_trace.data[cpu]->disabled); } + old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; + /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; @@ -4086,10 +4090,26 @@ void ftrace_dump(void) else printk(KERN_TRACE "---------------------------------\n"); + /* Re-enable tracing if requested */ + if (!disable_tracing) { + trace_flags |= old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&global_trace.data[cpu]->disabled); + } + tracing_on(); + } + out: spin_unlock_irqrestore(&ftrace_dump_lock, flags); } +/* By default: disable tracing after the dump */ +void ftrace_dump(void) +{ + __ftrace_dump(true); +} + __init static int tracer_alloc_buffers(void) { struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 38856ba78a92..b56dcf7d3566 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -248,6 +248,28 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) #ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* Maximum number of functions to trace before diagnosing a hang */ +#define GRAPH_MAX_FUNC_TEST 100000000 + +static void __ftrace_dump(bool disable_tracing); +static unsigned int graph_hang_thresh; + +/* Wrap the real function entry probe to avoid possible hanging */ +static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) +{ + /* This is harmlessly racy, we want to approximately detect a hang */ + if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { + ftrace_graph_stop(); + printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); + if (ftrace_dump_on_oops) + __ftrace_dump(false); + return 0; + } + + return trace_graph_entry(trace); +} + /* * Pretty much the same than for the function tracer from which the selftest * has been borrowed. @@ -259,15 +281,29 @@ trace_selftest_startup_function_graph(struct tracer *trace, int ret; unsigned long count; - ret = tracer_init(trace, tr); + /* + * Simulate the init() callback but we attach a watchdog callback + * to detect and recover from possible hangs + */ + tracing_reset_online_cpus(tr); + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry_watchdog); if (ret) { warn_failed_init_tracer(trace, ret); goto out; } + tracing_start_cmdline_record(); /* Sleep for a 1/10 of a second */ msleep(100); + /* Have we just recovered from a hang? */ + if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { + trace->reset(tr); + ret = -1; + goto out; + } + tracing_stop(); /* check the trace buffer */ -- cgit v1.2.3-59-g8ed1b From 0cf53ff62b3e9e491ff5e5f05b193fb6ce643047 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 15:13:07 +0100 Subject: tracing: keep the tracing buffer after self-test failure Instead of using ftrace_dump_on_oops, it's far more convenient to have the trace leading up to a self-test failure available in /debug/tracing/trace. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237694675-23509-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b56dcf7d3566..08f4eb2763d1 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -299,7 +299,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, /* Have we just recovered from a hang? */ if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { - trace->reset(tr); + tracing_selftest_disabled = true; ret = -1; goto out; } -- cgit v1.2.3-59-g8ed1b From cf027f645e6aee4f0ca6197a6b6a57f327fdb13f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:30:39 -0500 Subject: tracing: add run-time field descriptions for event filtering This patch makes the field descriptions defined for event tracing available at run-time, for the event-filtering mechanism introduced in a subsequent patch. The common event fields are prepended with 'common_' in the format display, allowing them to be distinguished from the other fields that might internally have same name and can therefore be unambiguously used in filters. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker LKML-Reference: <1237710639.7703.46.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 30 +++++++++++++++++-------- kernel/trace/trace_events.c | 40 ++++++++++++++++++++++++++++++++- kernel/trace/trace_events_stage_2.h | 45 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_stage_3.h | 2 ++ 4 files changed, 107 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7cfb741be200..9288dc7ad14f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -775,16 +775,26 @@ enum { TRACE_EVENT_TYPE_RAW = 2, }; +struct ftrace_event_field { + struct list_head link; + char *name; + char *type; + int offset; + int size; +}; + struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); + char *name; + char *system; + struct dentry *dir; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); + int id; + int (*raw_init)(void); + int (*show_format)(struct trace_seq *s); + int (*define_fields)(void); + struct list_head fields; #ifdef CONFIG_EVENT_PROFILE atomic_t profile_count; @@ -793,6 +803,8 @@ struct ftrace_event_call { #endif }; +int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size); void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3047b56f6637..961b057da28b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,6 +19,34 @@ static DEFINE_MUTEX(event_mutex); +int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size) +{ + struct ftrace_event_field *field; + + field = kmalloc(sizeof(*field), GFP_KERNEL); + if (!field) + goto err; + field->name = kstrdup(name, GFP_KERNEL); + if (!field->name) + goto err; + field->type = kstrdup(type, GFP_KERNEL); + if (!field->type) + goto err; + field->offset = offset; + field->size = size; + list_add(&field->link, &call->fields); + + return 0; +err: + if (field) { + kfree(field->name); + kfree(field->type); + } + kfree(field); + return -ENOMEM; +} + static void ftrace_clear_events(void) { struct ftrace_event_call *call = (void *)__start_ftrace_events; @@ -343,7 +371,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, #undef FIELD #define FIELD(type, name) \ - #type, #name, offsetof(typeof(field), name), sizeof(field.name) + #type, "common_" #name, offsetof(typeof(field), name), \ + sizeof(field.name) static int trace_write_header(struct trace_seq *s) { @@ -581,6 +610,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) call->name); } + if (call->define_fields) { + ret = call->define_fields(); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; + } + } + /* A trace may not want to export its format */ if (!call->show_format) return 0; diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 5117c43f5c67..30743f7d4110 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -129,3 +129,48 @@ ftrace_format_##call(struct trace_seq *s) \ } #include + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_raw_##call field; \ + struct ftrace_event_call *event_call = &event_##call; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ +} + +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 6b3261ca988c..468938f70141 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -252,6 +252,7 @@ static int ftrace_raw_init_event_##call(void) \ if (!id) \ return -ENODEV; \ event_##call.id = id; \ + INIT_LIST_HEAD(&event_##call.fields); \ return 0; \ } \ \ @@ -264,6 +265,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ _TRACE_PROFILE_INIT(call) \ } -- cgit v1.2.3-59-g8ed1b From f80d2d7725b04f8225b11b55e43bb2c77c819926 Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Sun, 22 Mar 2009 19:11:10 +0200 Subject: tracing, Text Edit Lock: Fix one sparse warning in kernel/extable.c Impact: cleanup. The global mutex text_mutex if declared in linux/memory.h, so this file needs to be included into kernel/extable.c, where the same mutex is defined. This fixes the following sparse warning: kernel/extable.c:32:1: warning: symbol 'text_mutex' was not declared. Should it be static? Signed-off-by: Dmitri Vorobiev LKML-Reference: <1237741871-5827-3-git-send-email-dmitri.vorobiev@movial.com> Signed-off-by: Ingo Molnar --- kernel/extable.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/extable.c b/kernel/extable.c index 25d39b0c3a1b..b54a6017b6b5 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -16,6 +16,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From b8b94265337f83b7db9c5f429b1769d463d7da8c Mon Sep 17 00:00:00 2001 From: Dmitri Vorobiev Date: Sun, 22 Mar 2009 19:11:11 +0200 Subject: tracing: fix four sparse warnings Impact: cleanup. This patch fixes the following sparse warnings: kernel/trace/trace.c:385:9: warning: symbol 'trace_seq_to_buffer' was not declared. Should it be static? kernel/trace/trace_clock.c:29:13: warning: symbol 'trace_clock_local' was not declared. Should it be static? kernel/trace/trace_clock.c:54:13: warning: symbol 'trace_clock' was not declared. Should it be static? kernel/trace/trace_clock.c:74:13: warning: symbol 'trace_clock_global' was not declared. Should it be static? Signed-off-by: Dmitri Vorobiev LKML-Reference: <1237741871-5827-4-git-send-email-dmitri.vorobiev@movial.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- kernel/trace/trace_clock.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6fac0ffe6f0..ace685c70186 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -382,7 +382,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) return cnt; } -ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) +static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; void *ret; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 05b176abfd30..b588fd81f7f9 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * trace_clock_local(): the simplest and least coherent tracing clock. -- cgit v1.2.3-59-g8ed1b From 2d622719f1572ef31e0616444a515eba3094d050 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:30:49 -0500 Subject: tracing: add ring_buffer_event_discard() to ring buffer This patch overloads RINGBUF_TYPE_PADDING to provide a way to discard events from the ring buffer, for the event-filtering mechanism introduced in a subsequent patch. I did the initial version but thanks to Steven Rostedt for adding the parts that actually made it work. ;-) Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 11 +++-- kernel/trace/ring_buffer.c | 117 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 105 insertions(+), 23 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 9e6052bd1a1c..e1b7b2173885 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -18,10 +18,13 @@ struct ring_buffer_event { /** * enum ring_buffer_type - internal ring buffer types * - * @RINGBUF_TYPE_PADDING: Left over page padding - * array is ignored - * size is variable depending on how much + * @RINGBUF_TYPE_PADDING: Left over page padding or discarded event + * If time_delta is 0: + * array is ignored + * size is variable depending on how much * padding is needed + * If time_delta is non zero: + * everything else same as RINGBUF_TYPE_DATA * * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta * array[0] = time delta (28 .. 59) @@ -65,6 +68,8 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) return event->time_delta; } +void ring_buffer_event_discard(struct ring_buffer_event *event); + /* * size is in bytes for each per CPU buffer. */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 384ca5d9d729..a09027ec1714 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -189,16 +189,65 @@ enum { RB_LEN_TIME_STAMP = 16, }; -/* inline for ring buffer fast paths */ +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; +} + +static inline int rb_discarded_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING && event->time_delta; +} + +static void rb_event_set_padding(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + event->time_delta = 0; +} + +/** + * ring_buffer_event_discard - discard an event in the ring buffer + * @buffer: the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + static unsigned -rb_event_length(struct ring_buffer_event *event) +rb_event_data_length(struct ring_buffer_event *event) { unsigned length; + if (event->len) + length = event->len * RB_ALIGNMENT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; +} + +/* inline for ring buffer fast paths */ +static unsigned +rb_event_length(struct ring_buffer_event *event) +{ switch (event->type) { case RINGBUF_TYPE_PADDING: - /* undefined */ - return -1; + if (rb_null_event(event)) + /* undefined */ + return -1; + return rb_event_data_length(event); case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -207,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event) return RB_LEN_TIME_STAMP; case RINGBUF_TYPE_DATA: - if (event->len) - length = event->len * RB_ALIGNMENT; - else - length = event->array[0]; - return length + RB_EVNT_HDR_SIZE; + return rb_event_data_length(event); default: BUG(); } @@ -845,11 +890,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); -static inline int rb_null_event(struct ring_buffer_event *event) -{ - return event->type == RINGBUF_TYPE_PADDING; -} - static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { @@ -1219,7 +1259,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (tail < BUF_PAGE_SIZE) { /* Mark the rest of the page with padding */ event = __rb_page_index(tail_page, tail); - event->type = RINGBUF_TYPE_PADDING; + rb_event_set_padding(event); } if (tail <= BUF_PAGE_SIZE) @@ -1969,7 +2009,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA) + if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2052,9 +2092,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - RB_WARN_ON(cpu_buffer, 1); + if (rb_null_event(event)) + RB_WARN_ON(cpu_buffer, 1); + /* + * Because the writer could be discarding every + * event it creates (which would probably be bad) + * if we were to go back to "again" then we may never + * catch up, and will trigger the warn on, or lock + * the box. Return the padding, and we will release + * the current locks, and try again. + */ rb_advance_reader(cpu_buffer); - return NULL; + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2115,8 +2164,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) switch (event->type) { case RINGBUF_TYPE_PADDING: - rb_inc_iter(iter); - goto again; + if (rb_null_event(event)) { + rb_inc_iter(iter); + goto again; + } + rb_advance_iter(iter); + return event; case RINGBUF_TYPE_TIME_EXTEND: /* Internal data, OK to advance */ @@ -2163,10 +2216,16 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2185,10 +2244,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } @@ -2207,6 +2272,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) struct ring_buffer_event *event = NULL; unsigned long flags; + again: /* might be called in atomic */ preempt_disable(); @@ -2228,6 +2294,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_consume); @@ -2306,6 +2377,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; + again: spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); if (!event) @@ -2315,6 +2387,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + if (event && event->type == RINGBUF_TYPE_PADDING) { + cpu_relax(); + goto again; + } + return event; } EXPORT_SYMBOL_GPL(ring_buffer_read); -- cgit v1.2.3-59-g8ed1b From 7ce7e4249921d5073e764f7ff7ad83cfa9894bd7 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:31:04 -0500 Subject: tracing: add per-event filtering This patch adds per-event filtering to the event tracing subsystem. It adds a 'filter' debugfs file to each event directory. This file can be written to to set filters; reading from it will display the current set of filters set for that event. Basically, any field listed in the 'format' file for an event can be filtered on (including strings, but not yet other array types) using either matching ('==') or non-matching ('!=') 'predicates'. A 'predicate' can be either a single expression: # echo pid != 0 > filter # cat filter pid != 0 or a compound expression of up to 8 sub-expressions combined using '&&' or '||': # echo comm == Xorg > filter # echo "&& sig != 29" > filter # cat filter comm == Xorg && sig != 29 Only events having field values matching an expression will be available in the trace output; non-matching events are discarded. Note that a compound expression is built up by echoing each sub-expression separately - it's not the most efficient way to do things, but it keeps the parser simple and assumes that compound expressions will be relatively uncommon. In any case, a subsequent patch introducing a way to set filters for entire subsystems should mitigate any need to do this for lots of events. Setting a filter without an '&&' or '||' clears the previous filter completely and sets the filter to the new expression: # cat filter comm == Xorg && sig != 29 # echo comm != Xorg # cat filter comm != Xorg To clear a filter, echo 0 to the filter file: # echo 0 > filter # cat filter none The limit of 8 predicates for a compound expression is arbitrary - for efficiency, it's implemented as an array of pointers to predicates, and 8 seemed more than enough for any filter... Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker LKML-Reference: <1237710665.7703.48.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/Makefile | 1 + kernel/trace/trace.h | 28 ++++ kernel/trace/trace_events.c | 77 +++++++++ kernel/trace/trace_events_filter.c | 326 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events_stage_3.h | 4 + 5 files changed, 436 insertions(+) create mode 100644 kernel/trace/trace_events_filter.c diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 0e45c206c2f9..2630f5121ec1 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -45,5 +45,6 @@ obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACER) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o +obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9288dc7ad14f..d9eb39e4bb38 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -795,6 +795,7 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; + struct filter_pred **preds; #ifdef CONFIG_EVENT_PROFILE atomic_t profile_count; @@ -803,8 +804,35 @@ struct ftrace_event_call { #endif }; +#define MAX_FILTER_PRED 8 + +struct filter_pred; + +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); + +struct filter_pred { + filter_pred_fn_t fn; + u64 val; + char *str_val; + int str_len; + char *field_name; + int offset; + int not; + int or; + int compound; + int clear; +}; + int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size); +extern void filter_free_pred(struct filter_pred *pred); +extern int filter_print_preds(struct filter_pred **preds, char *buf); +extern int filter_parse(char **pbuf, struct filter_pred *pred); +extern int filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred); +extern void filter_free_preds(struct ftrace_event_call *call); +extern int filter_match_preds(struct ftrace_event_call *call, void *rec); + void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 961b057da28b..97470c48956e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -459,6 +459,71 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return r; } +static ssize_t +event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + r = filter_print_preds(call->preds, s->buffer); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + + kfree(s); + + return r; +} + +static ssize_t +event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[64], *pbuf = buf; + struct filter_pred *pred; + int err; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return -ENOMEM; + + err = filter_parse(&pbuf, pred); + if (err < 0) { + filter_free_pred(pred); + return err; + } + + if (pred->clear) { + filter_free_preds(call); + return cnt; + } + + if (filter_add_pred(call, pred)) { + filter_free_pred(pred); + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -504,6 +569,12 @@ static const struct file_operations ftrace_event_id_fops = { .read = event_id_read, }; +static const struct file_operations ftrace_event_filter_fops = { + .open = tracing_open_generic, + .read = event_filter_read, + .write = event_filter_write, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -619,6 +690,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) } } + entry = debugfs_create_file("filter", 0444, call->dir, call, + &ftrace_event_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", call->name); + /* A trace may not want to export its format */ if (!call->show_format) return 0; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c new file mode 100644 index 000000000000..8e8c5fa25be9 --- /dev/null +++ b/kernel/trace/trace_events_filter.c @@ -0,0 +1,326 @@ +/* + * trace_events_filter - generic event filtering + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2009 Tom Zanussi + */ + +#include +#include +#include +#include + +#include "trace.h" + +static int filter_pred_64(struct filter_pred *pred, void *event) +{ + u64 *addr = (u64 *)(event + pred->offset); + u64 val = (u64)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_32(struct filter_pred *pred, void *event) +{ + u32 *addr = (u32 *)(event + pred->offset); + u32 val = (u32)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_16(struct filter_pred *pred, void *event) +{ + u16 *addr = (u16 *)(event + pred->offset); + u16 val = (u16)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_8(struct filter_pred *pred, void *event) +{ + u8 *addr = (u8 *)(event + pred->offset); + u8 val = (u8)pred->val; + int match; + + match = (val == *addr) ^ pred->not; + + return match; +} + +static int filter_pred_string(struct filter_pred *pred, void *event) +{ + char *addr = (char *)(event + pred->offset); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + +/* return 1 if event matches, 0 otherwise (discard) */ +int filter_match_preds(struct ftrace_event_call *call, void *rec) +{ + int i, matched, and_failed = 0; + struct filter_pred *pred; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (call->preds[i]) { + pred = call->preds[i]; + if (and_failed && !pred->or) + continue; + matched = pred->fn(pred, rec); + if (!matched && !pred->or) { + and_failed = 1; + continue; + } else if (matched && pred->or) + return 1; + } else + break; + } + + if (and_failed) + return 0; + + return 1; +} + +int filter_print_preds(struct filter_pred **preds, char *buf) +{ + ssize_t this_len = 0; + char *field_name; + struct filter_pred *pred; + int i; + + if (!preds) { + this_len += sprintf(buf + this_len, "none\n"); + return this_len; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (preds[i]) { + pred = preds[i]; + field_name = pred->field_name; + if (i) + this_len += sprintf(buf + this_len, + pred->or ? "|| " : "&& "); + this_len += sprintf(buf + this_len, + "%s ", field_name); + this_len += sprintf(buf + this_len, + pred->not ? "!= " : "== "); + if (pred->str_val) + this_len += sprintf(buf + this_len, + "%s\n", pred->str_val); + else + this_len += sprintf(buf + this_len, + "%llu\n", pred->val); + } else + break; + } + + return this_len; +} + +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *entry, *tmp; + + list_for_each_safe(entry, tmp, &call->fields) { + field = list_entry(entry, struct ftrace_event_field, link); + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +void filter_free_pred(struct filter_pred *pred) +{ + if (!pred) + return; + + kfree(pred->field_name); + kfree(pred->str_val); + kfree(pred); +} + +void filter_free_preds(struct ftrace_event_call *call) +{ + int i; + + if (call->preds) { + for (i = 0; i < MAX_FILTER_PRED; i++) + filter_free_pred(call->preds[i]); + kfree(call->preds); + call->preds = NULL; + } +} + +static int __filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred) +{ + int i; + + if (call->preds && !pred->compound) + filter_free_preds(call); + + if (!call->preds) { + call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + GFP_KERNEL); + if (!call->preds) + return -ENOMEM; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (!call->preds[i]) { + call->preds[i] = pred; + return 0; + } + } + + return -ENOMEM; +} + +static int is_string_field(const char *type) +{ + if (strchr(type, '[') && strstr(type, "char")) + return 1; + + return 0; +} + +int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +{ + struct ftrace_event_field *field; + + field = find_event_field(call, pred->field_name); + if (!field) + return -EINVAL; + + pred->offset = field->offset; + + if (is_string_field(field->type)) { + pred->fn = filter_pred_string; + pred->str_len = field->size; + return __filter_add_pred(call, pred); + } + + switch (field->size) { + case 8: + pred->fn = filter_pred_64; + break; + case 4: + pred->fn = filter_pred_32; + break; + case 2: + pred->fn = filter_pred_16; + break; + case 1: + pred->fn = filter_pred_8; + break; + default: + return -EINVAL; + } + + return __filter_add_pred(call, pred); +} + +int filter_parse(char **pbuf, struct filter_pred *pred) +{ + char *tmp, *tok, *val_str = NULL; + int tok_n = 0; + + /* field ==/!= number, or/and field ==/!= number, number */ + while ((tok = strsep(pbuf, " \n"))) { + if (tok_n == 0) { + if (!strcmp(tok, "0")) { + pred->clear = 1; + return 0; + } else if (!strcmp(tok, "&&")) { + pred->or = 0; + pred->compound = 1; + } else if (!strcmp(tok, "||")) { + pred->or = 1; + pred->compound = 1; + } else + pred->field_name = tok; + tok_n = 1; + continue; + } + if (tok_n == 1) { + if (!pred->field_name) + pred->field_name = tok; + else if (!strcmp(tok, "!=")) + pred->not = 1; + else if (!strcmp(tok, "==")) + pred->not = 0; + else { + pred->field_name = NULL; + return -EINVAL; + } + tok_n = 2; + continue; + } + if (tok_n == 2) { + if (pred->compound) { + if (!strcmp(tok, "!=")) + pred->not = 1; + else if (!strcmp(tok, "==")) + pred->not = 0; + else { + pred->field_name = NULL; + return -EINVAL; + } + } else { + val_str = tok; + break; /* done */ + } + tok_n = 3; + continue; + } + if (tok_n == 3) { + val_str = tok; + break; /* done */ + } + } + + pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!pred->field_name) + return -ENOMEM; + + pred->val = simple_strtoull(val_str, &tmp, 10); + if (tmp == val_str) { + pred->str_val = kstrdup(val_str, GFP_KERNEL); + if (!pred->str_val) + return -ENOMEM; + } + + return 0; +} + + diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 468938f70141..ebf215e87d5e 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -204,6 +204,7 @@ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ + struct ftrace_event_call *call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ @@ -222,6 +223,9 @@ static void ftrace_raw_event_##call(proto) \ assign; \ \ trace_current_buffer_unlock_commit(event, irq_flags, pc); \ + \ + if (call->preds && !filter_match_preds(call, entry)) \ + ring_buffer_event_discard(event); \ } \ \ static int ftrace_raw_reg_event_##call(void) \ -- cgit v1.2.3-59-g8ed1b From cfb180f3e71b2a280a254c8646a9ab1beab63f84 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 22 Mar 2009 03:31:17 -0500 Subject: tracing: add per-subsystem filtering This patch adds per-subsystem filtering to the event tracing subsystem. It adds a 'filter' debugfs file to each subsystem directory. This file can be written to to set filters; reading from it will display the current set of filters set for that subsystem. Basically what it does is propagate the filter down to each event contained in the subsystem. If a particular event doesn't have a field with the name specified in the filter, it simply doesn't get set for that event. You can verify whether or not the filter was set for a particular event by looking at the filter file for that event. As with per-event filters, compound expressions are supported, echoing '0' to the subsystem's filter file clears all filters in the subsystem, etc. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker LKML-Reference: <1237710677.7703.49.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 15 +++++++ kernel/trace/trace_events.c | 86 +++++++++++++++++++++++++++++++++++--- kernel/trace/trace_events_filter.c | 80 +++++++++++++++++++++++++++++++++++ 3 files changed, 175 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d9eb39e4bb38..f267723c3c52 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -804,6 +804,18 @@ struct ftrace_event_call { #endif }; +struct event_subsystem { + struct list_head list; + const char *name; + struct dentry *entry; + struct filter_pred **preds; +}; + +#define events_for_each(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + #define MAX_FILTER_PRED 8 struct filter_pred; @@ -832,6 +844,9 @@ extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_free_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); +extern void filter_free_subsystem_preds(struct event_subsystem *system); +extern int filter_add_subsystem_pred(struct event_subsystem *system, + struct filter_pred *pred); void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 97470c48956e..97d4daaddd9a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -524,6 +524,71 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + r = filter_print_preds(system->preds, s->buffer); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + + kfree(s); + + return r; +} + +static ssize_t +subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + char buf[64], *pbuf = buf; + struct filter_pred *pred; + int err; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return -ENOMEM; + + err = filter_parse(&pbuf, pred); + if (err < 0) { + filter_free_pred(pred); + return err; + } + + if (pred->clear) { + filter_free_subsystem_preds(system); + return cnt; + } + + if (filter_add_subsystem_pred(system, pred)) { + filter_free_pred(pred); + return -EINVAL; + } + + *ppos += cnt; + + return cnt; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -575,6 +640,12 @@ static const struct file_operations ftrace_event_filter_fops = { .write = event_filter_write, }; +static const struct file_operations ftrace_subsystem_filter_fops = { + .open = tracing_open_generic, + .read = subsystem_filter_read, + .write = subsystem_filter_write, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -595,18 +666,13 @@ static struct dentry *event_trace_events_dir(void) return d_events; } -struct event_subsystem { - struct list_head list; - const char *name; - struct dentry *entry; -}; - static LIST_HEAD(event_subsystems); static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; + struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -633,6 +699,14 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->name = name; list_add(&system->list, &event_subsystems); + system->preds = NULL; + + entry = debugfs_create_file("filter", 0444, system->entry, system, + &ftrace_subsystem_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", name); + return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8e8c5fa25be9..1ab20cee0e4c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -181,6 +181,27 @@ void filter_free_preds(struct ftrace_event_call *call) } } +void filter_free_subsystem_preds(struct event_subsystem *system) +{ + struct ftrace_event_call *call = __start_ftrace_events; + int i; + + if (system->preds) { + for (i = 0; i < MAX_FILTER_PRED; i++) + filter_free_pred(system->preds[i]); + kfree(system->preds); + system->preds = NULL; + } + + events_for_each(call) { + if (!call->name || !call->regfunc) + continue; + + if (!strcmp(call->system, system->name)) + filter_free_preds(call); + } +} + static int __filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) { @@ -250,6 +271,65 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return __filter_add_pred(call, pred); } +static struct filter_pred *copy_pred(struct filter_pred *pred) +{ + struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); + if (!new_pred) + return NULL; + + memcpy(new_pred, pred, sizeof(*pred)); + if (pred->str_val) { + new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); + new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!new_pred->str_val) { + kfree(new_pred); + return NULL; + } + } + + return new_pred; +} + +int filter_add_subsystem_pred(struct event_subsystem *system, + struct filter_pred *pred) +{ + struct ftrace_event_call *call = __start_ftrace_events; + struct filter_pred *event_pred; + int i; + + if (system->preds && !pred->compound) + filter_free_subsystem_preds(system); + + if (!system->preds) { + system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + GFP_KERNEL); + if (!system->preds) + return -ENOMEM; + } + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (!system->preds[i]) { + system->preds[i] = pred; + break; + } + if (i == MAX_FILTER_PRED - 1) + return -EINVAL; + } + + events_for_each(call) { + if (!call->name || !call->regfunc) + continue; + + if (!strcmp(call->system, system->name)) { + event_pred = copy_pred(pred); + if (event_pred) + filter_add_pred(call, event_pred); + } + } + + return 0; +} + int filter_parse(char **pbuf, struct filter_pred *pred) { char *tmp, *tok, *val_str = NULL; -- cgit v1.2.3-59-g8ed1b From fe9f57f250ab4d781b99504caeb218ca2db14c1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 22 Mar 2009 18:41:59 +0100 Subject: tracing: add run-time field descriptions for event filtering, kfree fix Impact: fix potential kfree of random data in (rare) failure path Zero-initialize the field structure. Reported-by: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <1237710639.7703.46.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 97d4daaddd9a..594d78aaa185 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -24,26 +24,31 @@ int trace_define_field(struct ftrace_event_call *call, char *type, { struct ftrace_event_field *field; - field = kmalloc(sizeof(*field), GFP_KERNEL); + field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) goto err; + field->name = kstrdup(name, GFP_KERNEL); if (!field->name) goto err; + field->type = kstrdup(type, GFP_KERNEL); if (!field->type) goto err; + field->offset = offset; field->size = size; list_add(&field->link, &call->fields); return 0; + err: if (field) { kfree(field->name); kfree(field->type); } kfree(field); + return -ENOMEM; } -- cgit v1.2.3-59-g8ed1b From 9bd7d099ab3f10dd666da399c064999bae427cd9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:43 +0100 Subject: tracing/events: make the filter files writable We need the filter files to be writable, the current filter file permissions are only set readable. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <1237759847-21025-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 594d78aaa185..19f61dd23219 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -706,7 +706,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->preds = NULL; - entry = debugfs_create_file("filter", 0444, system->entry, system, + entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); if (!entry) pr_warning("Could not create debugfs " @@ -769,7 +769,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) } } - entry = debugfs_create_file("filter", 0444, call->dir, call, + entry = debugfs_create_file("filter", 0644, call->dir, call, &ftrace_event_filter_fops); if (!entry) pr_warning("Could not create debugfs " -- cgit v1.2.3-59-g8ed1b From 07edf7121374609709ef1b0889f6e7b8d6a62ec1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:46 +0100 Subject: tracing/events: don't use wake up for events Impact: fix hard-lockup with sched switch events Some ftrace events, such as sched wakeup, can be traced while the runqueue lock is hold. Since they are using trace_current_buffer_unlock_commit(), they call wake_up() which can try to grab the runqueue lock too, resulting in a deadlock. Now for all event, we call a new helper: trace_nowake_buffer_unlock_commit() which do pretty the same than trace_current_buffer_unlock_commit() except than it doesn't call trace_wake_up(). Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237759847-21025-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 26 +++++++++++++++++++++----- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events_stage_3.h | 2 +- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e6fac0ffe6f0..6bad12819eb6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -860,15 +860,25 @@ static void ftrace_trace_stack(struct trace_array *tr, static void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc); -void trace_buffer_unlock_commit(struct trace_array *tr, - struct ring_buffer_event *event, - unsigned long flags, int pc) +static inline void __trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc, + int wake) { ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); - trace_wake_up(); + + if (wake) + trace_wake_up(); +} + +void trace_buffer_unlock_commit(struct trace_array *tr, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + __trace_buffer_unlock_commit(tr, event, flags, pc, 1); } struct ring_buffer_event * @@ -882,7 +892,13 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return trace_buffer_unlock_commit(&global_trace, event, flags, pc); + return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); +} + +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f267723c3c52..54fd9bcd0a65 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -483,6 +483,8 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, unsigned long flags, int pc); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index ebf215e87d5e..9a3bd49b52e5 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -222,7 +222,7 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - trace_current_buffer_unlock_commit(event, irq_flags, pc); \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ \ if (call->preds && !filter_match_preds(call, entry)) \ ring_buffer_event_discard(event); \ -- cgit v1.2.3-59-g8ed1b From 7e6ea92df3fd7cbe74e7985c6f3e40255c44b201 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:47 +0100 Subject: tracing/ftrace: make nop-tracer use polling wait for events on pipe Impact: display events when they arrive Now that the events don't use wake_up() anymore, we need the nop tracer to poll waiting for events on the pipe. Especially because nop is useful to look at orphan traces types (traces types that don't rely on specific tracers) because it doesn't produce traces itself. And unlike other tracers that trigger specific traces periodically, nop triggers no traces by itself that can wake him. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237759847-21025-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_nop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 9aa84bde23cd..394f94417e2f 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -91,6 +91,7 @@ struct tracer nop_trace __read_mostly = .name = "nop", .init = nop_trace_init, .reset = nop_trace_reset, + .wait_pipe = poll_wait_pipe, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_nop, #endif -- cgit v1.2.3-59-g8ed1b From b118415bfad6d75792a85ac999e25149db8e6919 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 23 Mar 2009 00:18:39 +0100 Subject: tracing/events: don't discard an event after commit When we want to filter an event, the filter test is done after the event is commited to the ring-buffer to be discarded later if needed. But a reader could be reading this event while we are trying to discard it. Other kind of racy events can even happen because the event is commited and can be read and/or consumed. What we want is to discard the event before committing it. Reported-by: Steven Rostedt Signed-off-by: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <1237763919-21505-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_stage_3.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 9a3bd49b52e5..9d2fa78cecca 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -222,10 +222,11 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ - \ if (call->preds && !filter_match_preds(call, entry)) \ ring_buffer_event_discard(event); \ + \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ + \ } \ \ static int ftrace_raw_reg_event_##call(void) \ -- cgit v1.2.3-59-g8ed1b From 75c8b417526529d0a7072e4d93ec99dbd483a6f4 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 23 Mar 2009 03:26:28 -0500 Subject: tracing/filters: use list_for_each_entry_safe Impact: cleanup Use list_for_each_entry_safe instead of list_for_each_entry in find_event_field(). Reported-by: Frederic Weisbecker Signed-off-by: Tom Zanussi Cc: Steven Rostedt LKML-Reference: <1237796788.7527.35.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1ab20cee0e4c..c4a413b70f78 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -147,11 +147,9 @@ int filter_print_preds(struct filter_pred **preds, char *buf) static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { - struct ftrace_event_field *field; - struct list_head *entry, *tmp; + struct ftrace_event_field *field, *next; - list_for_each_safe(entry, tmp, &call->fields) { - field = list_entry(entry, struct ftrace_event_field, link); + list_for_each_entry_safe(field, next, &call->fields, link) { if (!strcmp(field->name, name)) return field; } -- cgit v1.2.3-59-g8ed1b From ee6cdabc820a29bd607f38d9cb335c3ceddc673b Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 23 Mar 2009 03:26:42 -0500 Subject: tracing/filters: fix bug in copy_pred() Impact: fix potential crash on subsystem filter expression freeing When making a copy of the predicate, pred->field_name needs to be duplicated in the copy as well, otherwise bad things can happen due to later multiple frees of the same string. This affects only per-subsystem event filtering. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237796802.7527.39.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c4a413b70f78..fd01d8022ad1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -276,11 +276,19 @@ static struct filter_pred *copy_pred(struct filter_pred *pred) return NULL; memcpy(new_pred, pred, sizeof(*pred)); + + if (pred->field_name) { + new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); + if (!new_pred->field_name) { + kfree(new_pred); + return NULL; + } + } + if (pred->str_val) { new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); - new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); if (!new_pred->str_val) { - kfree(new_pred); + filter_free_pred(new_pred); return NULL; } } -- cgit v1.2.3-59-g8ed1b From c4cff064be678f1e8344d907499f2a81282edc19 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 23 Mar 2009 03:26:48 -0500 Subject: tracing/filters: clean up filter_add_subsystem_pred() Impact: cleanup, memory leak fix This patch cleans up filter_add_subsystem_pred(): - searches for the field before creating a copy of the pred - fixes memory leak in the case a predicate isn't applied - if -ENOMEM, makes sure there's no longer a reference to the pred so the caller can free the half-finished filter - changes the confusing i == MAX_FILTER_PRED - 1 comparison previously remarked upon This affects only per-subsystem event filtering. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237796808.7527.40.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 1 + kernel/trace/trace_events_filter.c | 31 ++++++++++++++++++++++++------- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 19f61dd23219..fdab30d6c835 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -585,6 +585,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } if (filter_add_subsystem_pred(system, pred)) { + filter_free_subsystem_preds(system); filter_free_pred(pred); return -EINVAL; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index fd01d8022ad1..4117c2ebc245 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -318,22 +318,39 @@ int filter_add_subsystem_pred(struct event_subsystem *system, system->preds[i] = pred; break; } - if (i == MAX_FILTER_PRED - 1) - return -EINVAL; } + if (i == MAX_FILTER_PRED) + return -EINVAL; + events_for_each(call) { + int err; + if (!call->name || !call->regfunc) continue; - if (!strcmp(call->system, system->name)) { - event_pred = copy_pred(pred); - if (event_pred) - filter_add_pred(call, event_pred); - } + if (strcmp(call->system, system->name)) + continue; + + if (!find_event_field(call, pred->field_name)) + continue; + + event_pred = copy_pred(pred); + if (!event_pred) + goto oom; + + err = filter_add_pred(call, event_pred); + if (err) + filter_free_pred(event_pred); + if (err == -ENOMEM) + goto oom; } return 0; + +oom: + system->preds[i] = NULL; + return -ENOMEM; } int filter_parse(char **pbuf, struct filter_pred *pred) -- cgit v1.2.3-59-g8ed1b From a29f280b739985a9e829d67af4d08e6584153495 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 23 Mar 2009 14:57:38 +0200 Subject: [MTD] [OneNAND] omap2: panic_write may be in an interrupt context panic_write may read in an interrupt context. Signed-off-by: Adrian Hunter Signed-off-by: David Woodhouse --- drivers/mtd/onenand/omap2.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c index 77a4f1446156..2c199b30ab58 100644 --- a/drivers/mtd/onenand/omap2.c +++ b/drivers/mtd/onenand/omap2.c @@ -294,6 +294,10 @@ static int omap3_onenand_read_bufferram(struct mtd_info *mtd, int area, if (bram_offset & 3 || (size_t)buf & 3 || count < 384) goto out_copy; + /* panic_write() may be in an interrupt context */ + if (in_interrupt()) + goto out_copy; + if (buf >= high_memory) { struct page *p1; -- cgit v1.2.3-59-g8ed1b From 9ce969082e490d0a5a81862b364337c93dc3482a Mon Sep 17 00:00:00 2001 From: Kyungmin Park Date: Mon, 17 Nov 2008 17:54:28 +0900 Subject: [MTD] [OneNAND] Add write-while-program support OneNAND write-while-program method of writing improves performance, compared with ordinary writes, by transferring data to OneNAND's RAM buffers atthe same time as programming the NAND core. When writing several NAND pages at a time, an improvement of 12% to 25% is seen. Signed-off-by: Kyungmin Park Signed-off-by: Adrian Hunter Signed-off-by: David Woodhouse --- drivers/mtd/onenand/onenand_base.c | 145 ++++++++++++++++++++++++------------- 1 file changed, 96 insertions(+), 49 deletions(-) diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 529af271db17..30d6999e5f9f 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -1455,7 +1455,8 @@ static int onenand_write_ops_nolock(struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops) { struct onenand_chip *this = mtd->priv; - int written = 0, column, thislen, subpage; + int written = 0, column, thislen = 0, subpage = 0; + int prev = 0, prevlen = 0, prev_subpage = 0, first = 1; int oobwritten = 0, oobcolumn, thisooblen, oobsize; size_t len = ops->len; size_t ooblen = ops->ooblen; @@ -1482,6 +1483,10 @@ static int onenand_write_ops_nolock(struct mtd_info *mtd, loff_t to, return -EINVAL; } + /* Check zero length */ + if (!len) + return 0; + if (ops->mode == MTD_OOB_AUTO) oobsize = this->ecclayout->oobavail; else @@ -1492,79 +1497,121 @@ static int onenand_write_ops_nolock(struct mtd_info *mtd, loff_t to, column = to & (mtd->writesize - 1); /* Loop until all data write */ - while (written < len) { - u_char *wbuf = (u_char *) buf; + while (1) { + if (written < len) { + u_char *wbuf = (u_char *) buf; - thislen = min_t(int, mtd->writesize - column, len - written); - thisooblen = min_t(int, oobsize - oobcolumn, ooblen - oobwritten); + thislen = min_t(int, mtd->writesize - column, len - written); + thisooblen = min_t(int, oobsize - oobcolumn, ooblen - oobwritten); - cond_resched(); + cond_resched(); - this->command(mtd, ONENAND_CMD_BUFFERRAM, to, thislen); + this->command(mtd, ONENAND_CMD_BUFFERRAM, to, thislen); - /* Partial page write */ - subpage = thislen < mtd->writesize; - if (subpage) { - memset(this->page_buf, 0xff, mtd->writesize); - memcpy(this->page_buf + column, buf, thislen); - wbuf = this->page_buf; - } + /* Partial page write */ + subpage = thislen < mtd->writesize; + if (subpage) { + memset(this->page_buf, 0xff, mtd->writesize); + memcpy(this->page_buf + column, buf, thislen); + wbuf = this->page_buf; + } - this->write_bufferram(mtd, ONENAND_DATARAM, wbuf, 0, mtd->writesize); + this->write_bufferram(mtd, ONENAND_DATARAM, wbuf, 0, mtd->writesize); - if (oob) { - oobbuf = this->oob_buf; + if (oob) { + oobbuf = this->oob_buf; - /* We send data to spare ram with oobsize - * to prevent byte access */ - memset(oobbuf, 0xff, mtd->oobsize); - if (ops->mode == MTD_OOB_AUTO) - onenand_fill_auto_oob(mtd, oobbuf, oob, oobcolumn, thisooblen); - else - memcpy(oobbuf + oobcolumn, oob, thisooblen); + /* We send data to spare ram with oobsize + * to prevent byte access */ + memset(oobbuf, 0xff, mtd->oobsize); + if (ops->mode == MTD_OOB_AUTO) + onenand_fill_auto_oob(mtd, oobbuf, oob, oobcolumn, thisooblen); + else + memcpy(oobbuf + oobcolumn, oob, thisooblen); - oobwritten += thisooblen; - oob += thisooblen; - oobcolumn = 0; + oobwritten += thisooblen; + oob += thisooblen; + oobcolumn = 0; + } else + oobbuf = (u_char *) ffchars; + + this->write_bufferram(mtd, ONENAND_SPARERAM, oobbuf, 0, mtd->oobsize); } else - oobbuf = (u_char *) ffchars; + ONENAND_SET_NEXT_BUFFERRAM(this); - this->write_bufferram(mtd, ONENAND_SPARERAM, oobbuf, 0, mtd->oobsize); + /* + * 2 PLANE, MLC, and Flex-OneNAND doesn't support + * write-while-programe feature. + */ + if (!ONENAND_IS_2PLANE(this) && !first) { + ONENAND_SET_PREV_BUFFERRAM(this); - this->command(mtd, ONENAND_CMD_PROG, to, mtd->writesize); + ret = this->wait(mtd, FL_WRITING); - ret = this->wait(mtd, FL_WRITING); + /* In partial page write we don't update bufferram */ + onenand_update_bufferram(mtd, prev, !ret && !prev_subpage); + if (ret) { + written -= prevlen; + printk(KERN_ERR "onenand_write_ops_nolock: write filaed %d\n", ret); + break; + } - /* In partial page write we don't update bufferram */ - onenand_update_bufferram(mtd, to, !ret && !subpage); - if (ONENAND_IS_2PLANE(this)) { - ONENAND_SET_BUFFERRAM1(this); - onenand_update_bufferram(mtd, to + this->writesize, !ret && !subpage); - } + if (written == len) { + /* Only check verify write turn on */ + ret = onenand_verify(mtd, buf - len, to - len, len); + if (ret) + printk(KERN_ERR "onenand_write_ops_nolock: verify failed %d\n", ret); + break; + } - if (ret) { - printk(KERN_ERR "onenand_write_ops_nolock: write filaed %d\n", ret); - break; + ONENAND_SET_NEXT_BUFFERRAM(this); } - /* Only check verify write turn on */ - ret = onenand_verify(mtd, buf, to, thislen); - if (ret) { - printk(KERN_ERR "onenand_write_ops_nolock: verify failed %d\n", ret); - break; - } + this->command(mtd, ONENAND_CMD_PROG, to, mtd->writesize); - written += thislen; + /* + * 2 PLANE, MLC, and Flex-OneNAND wait here + */ + if (ONENAND_IS_2PLANE(this)) { + ret = this->wait(mtd, FL_WRITING); - if (written == len) - break; + /* In partial page write we don't update bufferram */ + onenand_update_bufferram(mtd, to, !ret && !subpage); + if (ret) { + printk(KERN_ERR "onenand_write_ops_nolock: write filaed %d\n", ret); + break; + } + + /* Only check verify write turn on */ + ret = onenand_verify(mtd, buf, to, thislen); + if (ret) { + printk(KERN_ERR "onenand_write_ops_nolock: verify failed %d\n", ret); + break; + } + + written += thislen; + + if (written == len) + break; + + } else + written += thislen; column = 0; + prev_subpage = subpage; + prev = to; + prevlen = thislen; to += thislen; buf += thislen; + first = 0; } + /* In error case, clear all bufferrams */ + if (written != len) + onenand_invalidate_bufferram(mtd, 0, -1); + ops->retlen = written; + ops->oobretlen = oobwritten; return ret; } -- cgit v1.2.3-59-g8ed1b From c0f92ba99bdeaf35f9c580291b4e1a657c67fbd4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:44 +0100 Subject: debugfs: function to know if debugfs is initialized Impact: add new debugfs API With ftrace, some tracers are registered in early initcalls and attempt to create files on the debugfs filesystem. Depending on when they are activated, they can try to create their file at any time. Some checks can be done on the tracing area but providing a helper to know if debugfs is registered make it really more easy. Signed-off-by: Frederic Weisbecker Acked-by: Greg Kroah-Hartman Cc: Steven Rostedt LKML-Reference: <1237759847-21025-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/debugfs/inode.c | 16 ++++++++++++++++ include/linux/debugfs.h | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 81ae9ea3c6e1..0662ba6de85a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -30,6 +30,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; +static bool debugfs_registered; static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t dev) { @@ -496,6 +497,16 @@ exit: } EXPORT_SYMBOL_GPL(debugfs_rename); +/** + * debugfs_initialized - Tells whether debugfs has been registered + */ +bool debugfs_initialized(void) +{ + return debugfs_registered; +} +EXPORT_SYMBOL_GPL(debugfs_initialized); + + static struct kobject *debug_kobj; static int __init debugfs_init(void) @@ -509,11 +520,16 @@ static int __init debugfs_init(void) retval = register_filesystem(&debug_fs_type); if (retval) kobject_put(debug_kobj); + else + debugfs_registered = true; + return retval; } static void __exit debugfs_exit(void) { + debugfs_registered = false; + simple_release_fs(&debugfs_mount, &debugfs_mount_count); unregister_filesystem(&debug_fs_type); kobject_put(debug_kobj); diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index af0e01d4c663..eb5c2ba2f81a 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -71,6 +71,9 @@ struct dentry *debugfs_create_bool(const char *name, mode_t mode, struct dentry *debugfs_create_blob(const char *name, mode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob); + +bool debugfs_initialized(void); + #else #include @@ -183,6 +186,11 @@ static inline struct dentry *debugfs_create_blob(const char *name, mode_t mode, return ERR_PTR(-ENODEV); } +static inline bool debugfs_initialized(void) +{ + return false; +} + #endif #endif -- cgit v1.2.3-59-g8ed1b From 3e1f60b80cafcb5d7e8d3665b35962fbb8fb9efa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 22 Mar 2009 23:10:45 +0100 Subject: tracing/ftrace: check if debugfs is registered before creating files Impact: fix a crash with ftrace={nop,boot} parameter If the nop or initcall tracers are launched as boot tracers, they will attempt to create their option directory and files. But these tracers are registered very early and then assigned as "boot tracers" very early if asked to. Since they do this before debugfs has been registered (core initcall), a crash is triggered. Another early tracers could also come later. So we fix it by checking if debugfs is initialized before creating the root tracing directory. Signed-off-by: Frederic Weisbecker Cc: Greg Kroah-Hartman Cc: Steven Rostedt LKML-Reference: <1237759847-21025-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ace685c70186..f0e1337b1ebd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3513,6 +3513,9 @@ struct dentry *tracing_init_dentry(void) if (d_tracer) return d_tracer; + if (!debugfs_initialized()) + return NULL; + d_tracer = debugfs_create_dir("tracing", NULL); if (!d_tracer && !once) { -- cgit v1.2.3-59-g8ed1b From 45b9560895b07a4a09d55d49235c984db512c5aa Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 24 Mar 2009 01:07:24 +0300 Subject: tracing: Fix TRACING_SUPPORT dependency for PPC32 commit 40ada30f9621fbd831ac2437b9a2a399aa ("tracing: clean up menu"), despite the "clean up" in its purpose, introduced a behavioural change for Kconfig symbols: we no longer able to select tracing support on PPC32 (because IRQFLAGS_SUPPORT isn't yet implemented). The IRQFLAGS_SUPPORT is not mandatory for most tracers, tracing core has a special case for platforms w/o irqflags (which, by the way, has become useless as of the commit above). Though according to Ingo Molnar, there was periodic build failures on weird, unmaintained architectures that had no irqflags-tracing support and hence didn't know the raw_irqs_save/restore primitives. Thus we'd better not enable irqflags-less tracing for all architectures. This patch restores the old behaviour for PPC32, and thus brings the tracing back. Other architectures can either add themselves to the exception list or (better) implement TRACE_IRQFLAGS_SUPPORT. Signed-off-by: Anton Vorontsov Acked-b: Steven Rostedt Cc: linuxppc-dev@ozlabs.org LKML-Reference: <20090323220724.GA9851@oksana.dev.rtsoft.ru> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b0a46f889659..8a4d72931042 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -63,7 +63,11 @@ config TRACING # config TRACING_SUPPORT bool - depends on TRACE_IRQFLAGS_SUPPORT + # PPC32 has no irqflags tracing support, but it can use most of the + # tracers anyway, they were tested to build and work. Note that new + # exceptions to this list aren't welcomed, better implement the + # irqflags tracing for your architecture. + depends on TRACE_IRQFLAGS_SUPPORT || PPC32 depends on STACKTRACE_SUPPORT default y -- cgit v1.2.3-59-g8ed1b From 1618536961d31f9b3f55767b22d4a897f4204c26 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 23 Mar 2009 22:17:01 +0100 Subject: tracing/function-graph-tracer: fix functions call traces imbalance Impact: fix traces output Sometimes one can observe an imbalance in the traces between function calls and function return traces: func1() { } } The curly brace inside func1() is the return of another function nested inside func1. The return trace have been inserted in the buffer but not the entry. We are storing a return address on the function traces stack while we haven't inserted its entry on the buffer, hence the imbalance on the traces. This is because the tracers doesn't check all failures that can happen on buffer insertion. This patch reports the tracing recursion failures and the ring buffer failures. In such cases, we now restore the original return address for the function, giving up its return trace. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1237843021-11695-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6bad12819eb6..89f0c2544ad0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -924,7 +924,7 @@ trace_function(struct trace_array *tr, } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void __trace_graph_entry(struct trace_array *tr, +static int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, int pc) @@ -933,15 +933,17 @@ static void __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent_entry *entry; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) - return; + return 0; event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT, sizeof(*entry), flags, pc); if (!event) - return; + return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; ring_buffer_unlock_commit(global_trace.buffer, event); + + return 1; } static void __trace_graph_return(struct trace_array *tr, @@ -1162,6 +1164,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) struct trace_array_cpu *data; unsigned long flags; long disabled; + int ret; int cpu; int pc; @@ -1177,15 +1180,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_graph_entry(tr, trace, flags, pc); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; } /* Only do the atomic if it is not already set */ if (!test_tsk_trace_graph(current)) set_tsk_trace_graph(current); + atomic_dec(&data->disabled); local_irq_restore(flags); - return 1; + return ret; } void trace_graph_return(struct ftrace_graph_ret *trace) -- cgit v1.2.3-59-g8ed1b From dffc9efc562b8a2180e86df9f733cb113ba94b33 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Wed, 28 Jan 2009 17:07:19 +1000 Subject: m68knommu: mark all RAM as ZONE_DMA There is no reason not to put all RAM in ZONE_DMA for these simple m68k varients (same as the standard MMU m68k code does). With this in place the usual dma_alloc_coherent() work as expected. Signed-off-by: Greg Ungerer --- arch/m68knommu/mm/init.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c index 3bf249c53e41..7befc0c357e0 100644 --- a/arch/m68knommu/mm/init.c +++ b/arch/m68knommu/mm/init.c @@ -111,11 +111,7 @@ void __init paging_init(void) { unsigned long zones_size[MAX_NR_ZONES] = {0, }; - zones_size[ZONE_DMA] = 0 >> PAGE_SHIFT; - zones_size[ZONE_NORMAL] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT; -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = 0; -#endif + zones_size[ZONE_DMA] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT; free_area_init(zones_size); } } -- cgit v1.2.3-59-g8ed1b From bf5fe9ed73c2e696d3256ff18f926b103097abd2 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Wed, 28 Jan 2009 17:29:35 +1000 Subject: m68knommu: add a local dma_sync_single_for_cpu() function The onboard ethernet of many ColdFire parts uses DMA. The driver is being cleaned up to use the correct DMA handling functions, and m68knommuy currently does not implement dma_sync_single_for_cpu(). Signed-off-by: Greg Ungerer --- arch/m68knommu/kernel/dma.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/m68knommu/kernel/dma.c b/arch/m68knommu/kernel/dma.c index e10eafc52789..936125806638 100644 --- a/arch/m68knommu/kernel/dma.c +++ b/arch/m68knommu/kernel/dma.c @@ -9,10 +9,11 @@ #include #include #include +#include #include void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int gfp) + dma_addr_t *dma_handle, gfp_t gfp) { void *ret; /* ignore region specifiers */ @@ -34,3 +35,8 @@ void dma_free_coherent(struct device *dev, size_t size, { free_pages((unsigned long)vaddr, get_order(size)); } + +void dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, size_t size, enum dma_data_direction dir) +{ +} + -- cgit v1.2.3-59-g8ed1b From 76fdb7d6bd7922353abc8a1407c6dde3789042ce Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 15:33:15 +1000 Subject: m68k: use the mmu pci.h for m68knommu as well The (m68knommu) COMEMPCI support has been removed from the kernel, so now the mmu pci.h can be used on non-mmu setups as well. Remove the non-mmu pci_no.h and revert the pci_mm.h to be pci.h. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/pci.h | 17 ++++++++++++----- arch/m68k/include/asm/pci_mm.h | 12 ------------ arch/m68k/include/asm/pci_no.h | 29 ----------------------------- 3 files changed, 12 insertions(+), 46 deletions(-) delete mode 100644 arch/m68k/include/asm/pci_mm.h delete mode 100644 arch/m68k/include/asm/pci_no.h diff --git a/arch/m68k/include/asm/pci.h b/arch/m68k/include/asm/pci.h index dbea95373080..4ad0aea48ab4 100644 --- a/arch/m68k/include/asm/pci.h +++ b/arch/m68k/include/asm/pci.h @@ -1,5 +1,12 @@ -#ifdef __uClinux__ -#include "pci_no.h" -#else -#include "pci_mm.h" -#endif +#ifndef _ASM_M68K_PCI_H +#define _ASM_M68K_PCI_H + +#include + +/* The PCI address space does equal the physical memory + * address space. The networking and block device layers use + * this boolean for bounce buffer decisions. + */ +#define PCI_DMA_BUS_IS_PHYS (1) + +#endif /* _ASM_M68K_PCI_H */ diff --git a/arch/m68k/include/asm/pci_mm.h b/arch/m68k/include/asm/pci_mm.h deleted file mode 100644 index 4ad0aea48ab4..000000000000 --- a/arch/m68k/include/asm/pci_mm.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASM_M68K_PCI_H -#define _ASM_M68K_PCI_H - -#include - -/* The PCI address space does equal the physical memory - * address space. The networking and block device layers use - * this boolean for bounce buffer decisions. - */ -#define PCI_DMA_BUS_IS_PHYS (1) - -#endif /* _ASM_M68K_PCI_H */ diff --git a/arch/m68k/include/asm/pci_no.h b/arch/m68k/include/asm/pci_no.h deleted file mode 100644 index 9abbc03c73ee..000000000000 --- a/arch/m68k/include/asm/pci_no.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef M68KNOMMU_PCI_H -#define M68KNOMMU_PCI_H - -#include - -#ifdef CONFIG_COMEMPCI -/* - * These are pretty much arbitary with the CoMEM implementation. - * We have the whole address space to ourselves. - */ -#define PCIBIOS_MIN_IO 0x100 -#define PCIBIOS_MIN_MEM 0x00010000 - -#define pcibios_scan_all_fns(a, b) 0 - -/* - * Return whether the given PCI device DMA address mask can - * be supported properly. For example, if your device can - * only drive the low 24-bits during PCI bus mastering, then - * you would pass 0x00ffffff as the mask to this function. - */ -static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask) -{ - return 1; -} - -#endif /* CONFIG_COMEMPCI */ - -#endif /* M68KNOMMU_PCI_H */ -- cgit v1.2.3-59-g8ed1b From 9a4048a211513c3d6c56ddf2efb276113eae0b80 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 15:35:34 +1000 Subject: m68k: swtich non-mmu setups to use the mmu dma-mapping.h The mmu version of dma-mapping.h (which is dma-mapping_mm.h) is clean to be used for non-mmu setups as well. Remove dma-mapping_no.h and revert dma-mapping_mm.h to dma-mapping.h Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/dma-mapping.h | 113 ++++++++++++++++++++++++++++++++- arch/m68k/include/asm/dma-mapping_mm.h | 112 -------------------------------- arch/m68k/include/asm/dma-mapping_no.h | 6 -- 3 files changed, 110 insertions(+), 121 deletions(-) delete mode 100644 arch/m68k/include/asm/dma-mapping_mm.h delete mode 100644 arch/m68k/include/asm/dma-mapping_no.h diff --git a/arch/m68k/include/asm/dma-mapping.h b/arch/m68k/include/asm/dma-mapping.h index f4a4c7638f89..26f505488c11 100644 --- a/arch/m68k/include/asm/dma-mapping.h +++ b/arch/m68k/include/asm/dma-mapping.h @@ -1,5 +1,112 @@ -#ifdef __uClinux__ -#include "dma-mapping_no.h" +#ifndef _M68K_DMA_MAPPING_H +#define _M68K_DMA_MAPPING_H + +#include + +struct scatterlist; + +#ifndef CONFIG_MMU_SUN3 +static inline int dma_supported(struct device *dev, u64 mask) +{ + return 1; +} + +static inline int dma_set_mask(struct device *dev, u64 mask) +{ + return 0; +} + +static inline int dma_get_cache_alignment(void) +{ + return 1 << L1_CACHE_SHIFT; +} + +static inline int dma_is_consistent(struct device *dev, dma_addr_t dma_addr) +{ + return 0; +} + +extern void *dma_alloc_coherent(struct device *, size_t, + dma_addr_t *, gfp_t); +extern void dma_free_coherent(struct device *, size_t, + void *, dma_addr_t); + +static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, + dma_addr_t *handle, gfp_t flag) +{ + return dma_alloc_coherent(dev, size, handle, flag); +} +static inline void dma_free_noncoherent(struct device *dev, size_t size, + void *addr, dma_addr_t handle) +{ + dma_free_coherent(dev, size, addr, handle); +} +static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, + enum dma_data_direction dir) +{ + /* we use coherent allocation, so not much to do here. */ +} + +extern dma_addr_t dma_map_single(struct device *, void *, size_t, + enum dma_data_direction); +static inline void dma_unmap_single(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir) +{ +} + +extern dma_addr_t dma_map_page(struct device *, struct page *, + unsigned long, size_t size, + enum dma_data_direction); +static inline void dma_unmap_page(struct device *dev, dma_addr_t address, + size_t size, enum dma_data_direction dir) +{ +} + +extern int dma_map_sg(struct device *, struct scatterlist *, int, + enum dma_data_direction); +static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg, + int nhwentries, enum dma_data_direction dir) +{ +} + +extern void dma_sync_single_for_device(struct device *, dma_addr_t, size_t, + enum dma_data_direction); +extern void dma_sync_sg_for_device(struct device *, struct scatterlist *, int, + enum dma_data_direction); + +static inline void dma_sync_single_range_for_device(struct device *dev, + dma_addr_t dma_handle, unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + /* just sync everything for now */ + dma_sync_single_for_device(dev, dma_handle, offset + size, direction); +} + +static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, + size_t size, enum dma_data_direction dir) +{ +} + +static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir) +{ +} + +static inline void dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t dma_handle, unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + /* just sync everything for now */ + dma_sync_single_for_cpu(dev, dma_handle, offset + size, direction); +} + +static inline int dma_mapping_error(struct device *dev, dma_addr_t handle) +{ + return 0; +} + #else -#include "dma-mapping_mm.h" +#include #endif + +#endif /* _M68K_DMA_MAPPING_H */ diff --git a/arch/m68k/include/asm/dma-mapping_mm.h b/arch/m68k/include/asm/dma-mapping_mm.h deleted file mode 100644 index 26f505488c11..000000000000 --- a/arch/m68k/include/asm/dma-mapping_mm.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef _M68K_DMA_MAPPING_H -#define _M68K_DMA_MAPPING_H - -#include - -struct scatterlist; - -#ifndef CONFIG_MMU_SUN3 -static inline int dma_supported(struct device *dev, u64 mask) -{ - return 1; -} - -static inline int dma_set_mask(struct device *dev, u64 mask) -{ - return 0; -} - -static inline int dma_get_cache_alignment(void) -{ - return 1 << L1_CACHE_SHIFT; -} - -static inline int dma_is_consistent(struct device *dev, dma_addr_t dma_addr) -{ - return 0; -} - -extern void *dma_alloc_coherent(struct device *, size_t, - dma_addr_t *, gfp_t); -extern void dma_free_coherent(struct device *, size_t, - void *, dma_addr_t); - -static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t flag) -{ - return dma_alloc_coherent(dev, size, handle, flag); -} -static inline void dma_free_noncoherent(struct device *dev, size_t size, - void *addr, dma_addr_t handle) -{ - dma_free_coherent(dev, size, addr, handle); -} -static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, - enum dma_data_direction dir) -{ - /* we use coherent allocation, so not much to do here. */ -} - -extern dma_addr_t dma_map_single(struct device *, void *, size_t, - enum dma_data_direction); -static inline void dma_unmap_single(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) -{ -} - -extern dma_addr_t dma_map_page(struct device *, struct page *, - unsigned long, size_t size, - enum dma_data_direction); -static inline void dma_unmap_page(struct device *dev, dma_addr_t address, - size_t size, enum dma_data_direction dir) -{ -} - -extern int dma_map_sg(struct device *, struct scatterlist *, int, - enum dma_data_direction); -static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nhwentries, enum dma_data_direction dir) -{ -} - -extern void dma_sync_single_for_device(struct device *, dma_addr_t, size_t, - enum dma_data_direction); -extern void dma_sync_sg_for_device(struct device *, struct scatterlist *, int, - enum dma_data_direction); - -static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - /* just sync everything for now */ - dma_sync_single_for_device(dev, dma_handle, offset + size, direction); -} - -static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir) -{ -} - -static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir) -{ -} - -static inline void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - /* just sync everything for now */ - dma_sync_single_for_cpu(dev, dma_handle, offset + size, direction); -} - -static inline int dma_mapping_error(struct device *dev, dma_addr_t handle) -{ - return 0; -} - -#else -#include -#endif - -#endif /* _M68K_DMA_MAPPING_H */ diff --git a/arch/m68k/include/asm/dma-mapping_no.h b/arch/m68k/include/asm/dma-mapping_no.h deleted file mode 100644 index 1748f2bca940..000000000000 --- a/arch/m68k/include/asm/dma-mapping_no.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _M68KNOMMU_DMA_MAPPING_H -#define _M68KNOMMU_DMA_MAPPING_H - -#include - -#endif /* _M68KNOMMU_DMA_MAPPING_H */ -- cgit v1.2.3-59-g8ed1b From bf08d5251840ee7fe7ac561fc65aeead4f31335b Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 15:38:57 +1000 Subject: m68k: use the mc146818rtc.h for non-mmu setups as well. The mmu varient of mc146818rtc.h can be use on the non-mmu builds as well. Revert to the single mc146818rtc.h file. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/mc146818rtc.h | 31 ++++++++++++++++++++++++++----- arch/m68k/include/asm/mc146818rtc_mm.h | 26 -------------------------- arch/m68k/include/asm/mc146818rtc_no.h | 9 --------- 3 files changed, 26 insertions(+), 40 deletions(-) delete mode 100644 arch/m68k/include/asm/mc146818rtc_mm.h delete mode 100644 arch/m68k/include/asm/mc146818rtc_no.h diff --git a/arch/m68k/include/asm/mc146818rtc.h b/arch/m68k/include/asm/mc146818rtc.h index fb90dcf78426..9f70a01f73dc 100644 --- a/arch/m68k/include/asm/mc146818rtc.h +++ b/arch/m68k/include/asm/mc146818rtc.h @@ -1,5 +1,26 @@ -#ifdef __uClinux__ -#include "mc146818rtc_no.h" -#else -#include "mc146818rtc_mm.h" -#endif +/* + * Machine dependent access functions for RTC registers. + */ +#ifndef _ASM_MC146818RTC_H +#define _ASM_MC146818RTC_H + + +#ifdef CONFIG_ATARI +/* RTC in Atari machines */ + +#include + +#define RTC_PORT(x) (TT_RTC_BAS + 2*(x)) +#define RTC_ALWAYS_BCD 0 + +#define CMOS_READ(addr) ({ \ +atari_outb_p((addr),RTC_PORT(0)); \ +atari_inb_p(RTC_PORT(1)); \ +}) +#define CMOS_WRITE(val, addr) ({ \ +atari_outb_p((addr),RTC_PORT(0)); \ +atari_outb_p((val),RTC_PORT(1)); \ +}) +#endif /* CONFIG_ATARI */ + +#endif /* _ASM_MC146818RTC_H */ diff --git a/arch/m68k/include/asm/mc146818rtc_mm.h b/arch/m68k/include/asm/mc146818rtc_mm.h deleted file mode 100644 index 9f70a01f73dc..000000000000 --- a/arch/m68k/include/asm/mc146818rtc_mm.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Machine dependent access functions for RTC registers. - */ -#ifndef _ASM_MC146818RTC_H -#define _ASM_MC146818RTC_H - - -#ifdef CONFIG_ATARI -/* RTC in Atari machines */ - -#include - -#define RTC_PORT(x) (TT_RTC_BAS + 2*(x)) -#define RTC_ALWAYS_BCD 0 - -#define CMOS_READ(addr) ({ \ -atari_outb_p((addr),RTC_PORT(0)); \ -atari_inb_p(RTC_PORT(1)); \ -}) -#define CMOS_WRITE(val, addr) ({ \ -atari_outb_p((addr),RTC_PORT(0)); \ -atari_outb_p((val),RTC_PORT(1)); \ -}) -#endif /* CONFIG_ATARI */ - -#endif /* _ASM_MC146818RTC_H */ diff --git a/arch/m68k/include/asm/mc146818rtc_no.h b/arch/m68k/include/asm/mc146818rtc_no.h deleted file mode 100644 index 907a0481a140..000000000000 --- a/arch/m68k/include/asm/mc146818rtc_no.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Machine dependent access functions for RTC registers. - */ -#ifndef _M68KNOMMU_MC146818RTC_H -#define _M68KNOMMU_MC146818RTC_H - -/* empty include file to satisfy the include in genrtc.c/ide-geometry.c */ - -#endif /* _M68KNOMMU_MC146818RTC_H */ -- cgit v1.2.3-59-g8ed1b From 74870998bb8e5914016bed4396b28a468a2da5d3 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 15:46:06 +1000 Subject: m68k: use mmu kmap_types.h for non-mmu setups as well The mmu version of kmap_types.h is identical to the non-mmu one. Revert to a single file. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/kmap_types.h | 26 +++++++++++++++++++++----- arch/m68k/include/asm/kmap_types_mm.h | 21 --------------------- arch/m68k/include/asm/kmap_types_no.h | 21 --------------------- 3 files changed, 21 insertions(+), 47 deletions(-) delete mode 100644 arch/m68k/include/asm/kmap_types_mm.h delete mode 100644 arch/m68k/include/asm/kmap_types_no.h diff --git a/arch/m68k/include/asm/kmap_types.h b/arch/m68k/include/asm/kmap_types.h index 045d9fd122a2..c843c63d3801 100644 --- a/arch/m68k/include/asm/kmap_types.h +++ b/arch/m68k/include/asm/kmap_types.h @@ -1,5 +1,21 @@ -#ifdef __uClinux__ -#include "kmap_types_no.h" -#else -#include "kmap_types_mm.h" -#endif +#ifndef __ASM_M68K_KMAP_TYPES_H +#define __ASM_M68K_KMAP_TYPES_H + +enum km_type { + KM_BOUNCE_READ, + KM_SKB_SUNRPC_DATA, + KM_SKB_DATA_SOFTIRQ, + KM_USER0, + KM_USER1, + KM_BIO_SRC_IRQ, + KM_BIO_DST_IRQ, + KM_PTE0, + KM_PTE1, + KM_IRQ0, + KM_IRQ1, + KM_SOFTIRQ0, + KM_SOFTIRQ1, + KM_TYPE_NR +}; + +#endif /* __ASM_M68K_KMAP_TYPES_H */ diff --git a/arch/m68k/include/asm/kmap_types_mm.h b/arch/m68k/include/asm/kmap_types_mm.h deleted file mode 100644 index c843c63d3801..000000000000 --- a/arch/m68k/include/asm/kmap_types_mm.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __ASM_M68K_KMAP_TYPES_H -#define __ASM_M68K_KMAP_TYPES_H - -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; - -#endif /* __ASM_M68K_KMAP_TYPES_H */ diff --git a/arch/m68k/include/asm/kmap_types_no.h b/arch/m68k/include/asm/kmap_types_no.h deleted file mode 100644 index bfb6707575d1..000000000000 --- a/arch/m68k/include/asm/kmap_types_no.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __ASM_M68K_KMAP_TYPES_H -#define __ASM_M68K_KMAP_TYPES_H - -enum km_type { - KM_BOUNCE_READ, - KM_SKB_SUNRPC_DATA, - KM_SKB_DATA_SOFTIRQ, - KM_USER0, - KM_USER1, - KM_BIO_SRC_IRQ, - KM_BIO_DST_IRQ, - KM_PTE0, - KM_PTE1, - KM_IRQ0, - KM_IRQ1, - KM_SOFTIRQ0, - KM_SOFTIRQ1, - KM_TYPE_NR -}; - -#endif -- cgit v1.2.3-59-g8ed1b From bf7058f04ab74061b75a166e30ae0a7e617f0430 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 15:49:27 +1000 Subject: m68knommu: remove no longer used mcfpci.h The mcfpci.h was only used by the removed (m68knommu) COMEMPCI code. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/mcfpci.h | 119 ----------------------------------------- 1 file changed, 119 deletions(-) delete mode 100644 arch/m68k/include/asm/mcfpci.h diff --git a/arch/m68k/include/asm/mcfpci.h b/arch/m68k/include/asm/mcfpci.h deleted file mode 100644 index f1507dd06ec6..000000000000 --- a/arch/m68k/include/asm/mcfpci.h +++ /dev/null @@ -1,119 +0,0 @@ -/****************************************************************************/ - -/* - * mcfpci.h -- PCI bridge on ColdFire eval boards. - * - * (C) Copyright 2000, Greg Ungerer (gerg@snapgear.com) - * (C) Copyright 2000, Lineo Inc. (www.lineo.com) - */ - -/****************************************************************************/ -#ifndef mcfpci_h -#define mcfpci_h -/****************************************************************************/ - - -#ifdef CONFIG_PCI - -/* - * Address regions in the PCI address space are not mapped into the - * normal memory space of the ColdFire. They must be accessed via - * handler routines. This is easy for I/O space (inb/outb/etc) but - * needs some code changes to support ordinary memory. Interrupts - * also need to be vectored through the PCI handler first, then it - * will call the actual driver sub-handlers. - */ - -/* - * Un-define all the standard I/O access routines. - */ -#undef inb -#undef inw -#undef inl -#undef inb_p -#undef inw_p -#undef insb -#undef insw -#undef insl -#undef outb -#undef outw -#undef outl -#undef outb_p -#undef outw_p -#undef outsb -#undef outsw -#undef outsl - -#undef request_irq -#undef free_irq - -#undef bus_to_virt -#undef virt_to_bus - - -/* - * Re-direct all I/O memory accesses functions to PCI specific ones. - */ -#define inb pci_inb -#define inw pci_inw -#define inl pci_inl -#define inb_p pci_inb -#define inw_p pci_inw -#define insb pci_insb -#define insw pci_insw -#define insl pci_insl - -#define outb pci_outb -#define outw pci_outw -#define outl pci_outl -#define outb_p pci_outb -#define outw_p pci_outw -#define outsb pci_outsb -#define outsw pci_outsw -#define outsl pci_outsl - -#define request_irq pci_request_irq -#define free_irq pci_free_irq - -#define virt_to_bus pci_virt_to_bus -#define bus_to_virt pci_bus_to_virt - -#define CONFIG_COMEMPCI 1 - - -/* - * Prototypes of the real PCI functions (defined in bios32.c). - */ -unsigned char pci_inb(unsigned int addr); -unsigned short pci_inw(unsigned int addr); -unsigned int pci_inl(unsigned int addr); -void pci_insb(void *addr, void *buf, int len); -void pci_insw(void *addr, void *buf, int len); -void pci_insl(void *addr, void *buf, int len); - -void pci_outb(unsigned char val, unsigned int addr); -void pci_outw(unsigned short val, unsigned int addr); -void pci_outl(unsigned int val, unsigned int addr); -void pci_outsb(void *addr, void *buf, int len); -void pci_outsw(void *addr, void *buf, int len); -void pci_outsl(void *addr, void *buf, int len); - -int pci_request_irq(unsigned int irq, - void (*handler)(int, void *, struct pt_regs *), - unsigned long flags, - const char *device, - void *dev_id); -void pci_free_irq(unsigned int irq, void *dev_id); - -void *pci_bmalloc(int size); -void pci_bmfree(void *bmp, int len); -void pci_copytoshmem(unsigned long bmp, void *src, int size); -void pci_copyfromshmem(void *dst, unsigned long bmp, int size); -unsigned long pci_virt_to_bus(volatile void *address); -void *pci_bus_to_virt(unsigned long address); -void pci_bmcpyto(void *dst, void *src, int len); -void pci_bmcpyfrom(void *dst, void *src, int len); - -#endif /* CONFIG_PCI */ -/****************************************************************************/ -#endif /* mcfpci_h */ -- cgit v1.2.3-59-g8ed1b From 54cae79e52d6a72faa3d6993dcaba560bfac8db3 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 15:57:30 +1000 Subject: m68k: merge the mmu and non-mmu versions of mmu.h Trivial merge of the contents of mmu and non-mmu versions of mmu.h Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/mmu.h | 14 +++++++++++--- arch/m68k/include/asm/mmu_mm.h | 7 ------- arch/m68k/include/asm/mmu_no.h | 10 ---------- 3 files changed, 11 insertions(+), 20 deletions(-) delete mode 100644 arch/m68k/include/asm/mmu_mm.h delete mode 100644 arch/m68k/include/asm/mmu_no.h diff --git a/arch/m68k/include/asm/mmu.h b/arch/m68k/include/asm/mmu.h index a81d3946675f..8a11a63ee15a 100644 --- a/arch/m68k/include/asm/mmu.h +++ b/arch/m68k/include/asm/mmu.h @@ -1,5 +1,13 @@ -#ifdef __uClinux__ -#include "mmu_no.h" +#ifndef __MMU_H +#define __MMU_H + +#ifdef CONFIG_MMU +/* Default "unsigned long" context */ +typedef unsigned long mm_context_t; #else -#include "mmu_mm.h" +typedef struct { + unsigned long end_brk; +} mm_context_t; +#endif + #endif diff --git a/arch/m68k/include/asm/mmu_mm.h b/arch/m68k/include/asm/mmu_mm.h deleted file mode 100644 index ccd36d26615a..000000000000 --- a/arch/m68k/include/asm/mmu_mm.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __MMU_H -#define __MMU_H - -/* Default "unsigned long" context */ -typedef unsigned long mm_context_t; - -#endif diff --git a/arch/m68k/include/asm/mmu_no.h b/arch/m68k/include/asm/mmu_no.h deleted file mode 100644 index e2da1e6f09fe..000000000000 --- a/arch/m68k/include/asm/mmu_no.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef __M68KNOMMU_MMU_H -#define __M68KNOMMU_MMU_H - -/* Copyright (C) 2002, David McCullough */ - -typedef struct { - unsigned long end_brk; -} mm_context_t; - -#endif /* __M68KNOMMU_MMU_H */ -- cgit v1.2.3-59-g8ed1b From 375d1c7e2d56603406a72c67229734c61ebc6735 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 16:05:35 +1000 Subject: m68k: use mmu scatterlist.h for non-mmu setups as well There is only trivial differences between the non-mmu and mmu versions of scatterlist.h. So use the mmu one and remove the non-mmu one. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/scatterlist.h | 26 ++++++++++++++++++++++---- arch/m68k/include/asm/scatterlist_mm.h | 23 ----------------------- arch/m68k/include/asm/scatterlist_no.h | 22 ---------------------- 3 files changed, 22 insertions(+), 49 deletions(-) delete mode 100644 arch/m68k/include/asm/scatterlist_mm.h delete mode 100644 arch/m68k/include/asm/scatterlist_no.h diff --git a/arch/m68k/include/asm/scatterlist.h b/arch/m68k/include/asm/scatterlist.h index b7e528636252..d3a7a0edfeca 100644 --- a/arch/m68k/include/asm/scatterlist.h +++ b/arch/m68k/include/asm/scatterlist.h @@ -1,5 +1,23 @@ -#ifdef __uClinux__ -#include "scatterlist_no.h" -#else -#include "scatterlist_mm.h" +#ifndef _M68K_SCATTERLIST_H +#define _M68K_SCATTERLIST_H + +#include + +struct scatterlist { +#ifdef CONFIG_DEBUG_SG + unsigned long sg_magic; #endif + unsigned long page_link; + unsigned int offset; + unsigned int length; + + __u32 dma_address; /* A place to hang host-specific addresses at. */ +}; + +/* This is bogus and should go away. */ +#define ISA_DMA_THRESHOLD (0x00ffffff) + +#define sg_dma_address(sg) ((sg)->dma_address) +#define sg_dma_len(sg) ((sg)->length) + +#endif /* !(_M68K_SCATTERLIST_H) */ diff --git a/arch/m68k/include/asm/scatterlist_mm.h b/arch/m68k/include/asm/scatterlist_mm.h deleted file mode 100644 index d3a7a0edfeca..000000000000 --- a/arch/m68k/include/asm/scatterlist_mm.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _M68K_SCATTERLIST_H -#define _M68K_SCATTERLIST_H - -#include - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - unsigned int length; - - __u32 dma_address; /* A place to hang host-specific addresses at. */ -}; - -/* This is bogus and should go away. */ -#define ISA_DMA_THRESHOLD (0x00ffffff) - -#define sg_dma_address(sg) ((sg)->dma_address) -#define sg_dma_len(sg) ((sg)->length) - -#endif /* !(_M68K_SCATTERLIST_H) */ diff --git a/arch/m68k/include/asm/scatterlist_no.h b/arch/m68k/include/asm/scatterlist_no.h deleted file mode 100644 index afc4788b0d2c..000000000000 --- a/arch/m68k/include/asm/scatterlist_no.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _M68KNOMMU_SCATTERLIST_H -#define _M68KNOMMU_SCATTERLIST_H - -#include -#include - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - dma_addr_t dma_address; - unsigned int length; -}; - -#define sg_dma_address(sg) ((sg)->dma_address) -#define sg_dma_len(sg) ((sg)->length) - -#define ISA_DMA_THRESHOLD (0xffffffff) - -#endif /* !(_M68KNOMMU_SCATTERLIST_H) */ -- cgit v1.2.3-59-g8ed1b From 34055b806a6334624e7e8af6eefc3aee42372a85 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 16:08:53 +1000 Subject: m68k: use mmu fpu.h for non-mmu builds as well None of the currently support m68knommu targets have an FPU. Use the mmu version of fpu.h for all m68k. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/fpu.h | 22 +++++++++++++++++++--- arch/m68k/include/asm/fpu_mm.h | 21 --------------------- arch/m68k/include/asm/fpu_no.h | 21 --------------------- 3 files changed, 19 insertions(+), 45 deletions(-) delete mode 100644 arch/m68k/include/asm/fpu_mm.h delete mode 100644 arch/m68k/include/asm/fpu_no.h diff --git a/arch/m68k/include/asm/fpu.h b/arch/m68k/include/asm/fpu.h index e19bc5ed9c37..ffb6b8cfc6d5 100644 --- a/arch/m68k/include/asm/fpu.h +++ b/arch/m68k/include/asm/fpu.h @@ -1,5 +1,21 @@ -#ifdef __uClinux__ -#include "fpu_no.h" +#ifndef __M68K_FPU_H +#define __M68K_FPU_H + + +/* + * MAX floating point unit state size (FSAVE/FRESTORE) + */ + +#if defined(CONFIG_M68020) || defined(CONFIG_M68030) +#define FPSTATESIZE (216) +#elif defined(CONFIG_M68040) +#define FPSTATESIZE (96) +#elif defined(CONFIG_M68KFPU_EMU) +#define FPSTATESIZE (28) +#elif defined(CONFIG_M68060) +#define FPSTATESIZE (12) #else -#include "fpu_mm.h" +#define FPSTATESIZE (0) #endif + +#endif /* __M68K_FPU_H */ diff --git a/arch/m68k/include/asm/fpu_mm.h b/arch/m68k/include/asm/fpu_mm.h deleted file mode 100644 index ffb6b8cfc6d5..000000000000 --- a/arch/m68k/include/asm/fpu_mm.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __M68K_FPU_H -#define __M68K_FPU_H - - -/* - * MAX floating point unit state size (FSAVE/FRESTORE) - */ - -#if defined(CONFIG_M68020) || defined(CONFIG_M68030) -#define FPSTATESIZE (216) -#elif defined(CONFIG_M68040) -#define FPSTATESIZE (96) -#elif defined(CONFIG_M68KFPU_EMU) -#define FPSTATESIZE (28) -#elif defined(CONFIG_M68060) -#define FPSTATESIZE (12) -#else -#define FPSTATESIZE (0) -#endif - -#endif /* __M68K_FPU_H */ diff --git a/arch/m68k/include/asm/fpu_no.h b/arch/m68k/include/asm/fpu_no.h deleted file mode 100644 index b16b2e4fca2a..000000000000 --- a/arch/m68k/include/asm/fpu_no.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __M68KNOMMU_FPU_H -#define __M68KNOMMU_FPU_H - - -/* - * MAX floating point unit state size (FSAVE/FRESTORE) - */ -#if defined(CONFIG_M68020) || defined(CONFIG_M68030) -#define FPSTATESIZE (216/sizeof(unsigned char)) -#elif defined(CONFIG_M68040) -#define FPSTATESIZE (96/sizeof(unsigned char)) -#elif defined(CONFIG_M68KFPU_EMU) -#define FPSTATESIZE (28/sizeof(unsigned char)) -#elif defined(CONFIG_M68060) -#define FPSTATESIZE (12/sizeof(unsigned char)) -#else -/* Assume no FP unit present then... */ -#define FPSTATESIZE (2) /* dummy size */ -#endif - -#endif /* __M68K_FPU_H */ -- cgit v1.2.3-59-g8ed1b From ebafc17468d58bd903c886175ca84a4edc69ae1d Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 16:28:11 +1000 Subject: m68k: use mmu timex.h for non-mmu setups as well The non-mmu timex.h can be cleaned up and ends up being identical to the mmu timex.h, just just use that. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/timex.h | 21 +++++++++++++++++---- arch/m68k/include/asm/timex_mm.h | 18 ------------------ arch/m68k/include/asm/timex_no.h | 23 ----------------------- 3 files changed, 17 insertions(+), 45 deletions(-) delete mode 100644 arch/m68k/include/asm/timex_mm.h delete mode 100644 arch/m68k/include/asm/timex_no.h diff --git a/arch/m68k/include/asm/timex.h b/arch/m68k/include/asm/timex.h index 719762980578..b87f2f278f67 100644 --- a/arch/m68k/include/asm/timex.h +++ b/arch/m68k/include/asm/timex.h @@ -1,5 +1,18 @@ -#ifdef __uClinux__ -#include "timex_no.h" -#else -#include "timex_mm.h" +/* + * linux/include/asm-m68k/timex.h + * + * m68k architecture timex specifications + */ +#ifndef _ASMm68k_TIMEX_H +#define _ASMm68k_TIMEX_H + +#define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ + +typedef unsigned long cycles_t; + +static inline cycles_t get_cycles(void) +{ + return 0; +} + #endif diff --git a/arch/m68k/include/asm/timex_mm.h b/arch/m68k/include/asm/timex_mm.h deleted file mode 100644 index b87f2f278f67..000000000000 --- a/arch/m68k/include/asm/timex_mm.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * linux/include/asm-m68k/timex.h - * - * m68k architecture timex specifications - */ -#ifndef _ASMm68k_TIMEX_H -#define _ASMm68k_TIMEX_H - -#define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ - -typedef unsigned long cycles_t; - -static inline cycles_t get_cycles(void) -{ - return 0; -} - -#endif diff --git a/arch/m68k/include/asm/timex_no.h b/arch/m68k/include/asm/timex_no.h deleted file mode 100644 index 109050f3fe91..000000000000 --- a/arch/m68k/include/asm/timex_no.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * linux/include/asm-m68knommu/timex.h - * - * m68knommu architecture timex specifications - */ -#ifndef _ASM_M68KNOMMU_TIMEX_H -#define _ASM_M68KNOMMU_TIMEX_H - -#ifdef CONFIG_COLDFIRE -#include -#define CLOCK_TICK_RATE MCF_CLK -#else -#define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ -#endif - -typedef unsigned long cycles_t; - -static inline cycles_t get_cycles(void) -{ - return 0; -} - -#endif -- cgit v1.2.3-59-g8ed1b From f358cbcd49027d3a58c1f29dbf02a8369bfedcf7 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 16:36:43 +1000 Subject: m68k: use mmu version of elf.h for non-mmu builds as well Nothing specificly needed to support non-mmu m68k in elf.h, so just use the mmu one. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/elf.h | 120 +++++++++++++++++++++++++++++++++++++++-- arch/m68k/include/asm/elf_mm.h | 119 ---------------------------------------- arch/m68k/include/asm/elf_no.h | 110 ------------------------------------- 3 files changed, 117 insertions(+), 232 deletions(-) delete mode 100644 arch/m68k/include/asm/elf_mm.h delete mode 100644 arch/m68k/include/asm/elf_no.h diff --git a/arch/m68k/include/asm/elf.h b/arch/m68k/include/asm/elf.h index 04ce488bc63f..0b0f49eb876b 100644 --- a/arch/m68k/include/asm/elf.h +++ b/arch/m68k/include/asm/elf.h @@ -1,5 +1,119 @@ -#ifdef __uClinux__ -#include "elf_no.h" +#ifndef __ASMm68k_ELF_H +#define __ASMm68k_ELF_H + +/* + * ELF register definitions.. + */ + +#include +#include + +/* + * 68k ELF relocation types + */ +#define R_68K_NONE 0 +#define R_68K_32 1 +#define R_68K_16 2 +#define R_68K_8 3 +#define R_68K_PC32 4 +#define R_68K_PC16 5 +#define R_68K_PC8 6 +#define R_68K_GOT32 7 +#define R_68K_GOT16 8 +#define R_68K_GOT8 9 +#define R_68K_GOT32O 10 +#define R_68K_GOT16O 11 +#define R_68K_GOT8O 12 +#define R_68K_PLT32 13 +#define R_68K_PLT16 14 +#define R_68K_PLT8 15 +#define R_68K_PLT32O 16 +#define R_68K_PLT16O 17 +#define R_68K_PLT8O 18 +#define R_68K_COPY 19 +#define R_68K_GLOB_DAT 20 +#define R_68K_JMP_SLOT 21 +#define R_68K_RELATIVE 22 + +typedef unsigned long elf_greg_t; + +#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t)) +typedef elf_greg_t elf_gregset_t[ELF_NGREG]; + +typedef struct user_m68kfp_struct elf_fpregset_t; + +/* + * This is used to ensure we don't load something for the wrong architecture. + */ +#define elf_check_arch(x) ((x)->e_machine == EM_68K) + +/* + * These are used to set parameters in the core dumps. + */ +#define ELF_CLASS ELFCLASS32 +#define ELF_DATA ELFDATA2MSB +#define ELF_ARCH EM_68K + +/* For SVR4/m68k the function pointer to be registered with `atexit' is + passed in %a1. Although my copy of the ABI has no such statement, it + is actually used on ASV. */ +#define ELF_PLAT_INIT(_r, load_addr) _r->a1 = 0 + +#define USE_ELF_CORE_DUMP +#ifndef CONFIG_SUN3 +#define ELF_EXEC_PAGESIZE 4096 #else -#include "elf_mm.h" +#define ELF_EXEC_PAGESIZE 8192 +#endif + +/* This is the location that an ET_DYN program is loaded if exec'ed. Typical + use of this is to invoke "./ld.so someprog" to test out a new version of + the loader. We need to make sure that it is out of the way of the program + that it will "exec", and that there is sufficient room for the brk. */ + +#ifndef CONFIG_SUN3 +#define ELF_ET_DYN_BASE 0xD0000000UL +#else +#define ELF_ET_DYN_BASE 0x0D800000UL +#endif + +#define ELF_CORE_COPY_REGS(pr_reg, regs) \ + /* Bleech. */ \ + pr_reg[0] = regs->d1; \ + pr_reg[1] = regs->d2; \ + pr_reg[2] = regs->d3; \ + pr_reg[3] = regs->d4; \ + pr_reg[4] = regs->d5; \ + pr_reg[7] = regs->a0; \ + pr_reg[8] = regs->a1; \ + pr_reg[9] = regs->a2; \ + pr_reg[14] = regs->d0; \ + pr_reg[15] = rdusp(); \ + pr_reg[16] = regs->orig_d0; \ + pr_reg[17] = regs->sr; \ + pr_reg[18] = regs->pc; \ + pr_reg[19] = (regs->format << 12) | regs->vector; \ + { \ + struct switch_stack *sw = ((struct switch_stack *)regs) - 1; \ + pr_reg[5] = sw->d6; \ + pr_reg[6] = sw->d7; \ + pr_reg[10] = sw->a3; \ + pr_reg[11] = sw->a4; \ + pr_reg[12] = sw->a5; \ + pr_reg[13] = sw->a6; \ + } + +/* This yields a mask that user programs can use to figure out what + instruction set this cpu supports. */ + +#define ELF_HWCAP (0) + +/* This yields a string that ld.so will use to load implementation + specific libraries for optimization. This is more specific in + intent than poking at uname or /proc/cpuinfo. */ + +#define ELF_PLATFORM (NULL) + +#define SET_PERSONALITY(ex) set_personality(PER_LINUX) + #endif diff --git a/arch/m68k/include/asm/elf_mm.h b/arch/m68k/include/asm/elf_mm.h deleted file mode 100644 index 0b0f49eb876b..000000000000 --- a/arch/m68k/include/asm/elf_mm.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef __ASMm68k_ELF_H -#define __ASMm68k_ELF_H - -/* - * ELF register definitions.. - */ - -#include -#include - -/* - * 68k ELF relocation types - */ -#define R_68K_NONE 0 -#define R_68K_32 1 -#define R_68K_16 2 -#define R_68K_8 3 -#define R_68K_PC32 4 -#define R_68K_PC16 5 -#define R_68K_PC8 6 -#define R_68K_GOT32 7 -#define R_68K_GOT16 8 -#define R_68K_GOT8 9 -#define R_68K_GOT32O 10 -#define R_68K_GOT16O 11 -#define R_68K_GOT8O 12 -#define R_68K_PLT32 13 -#define R_68K_PLT16 14 -#define R_68K_PLT8 15 -#define R_68K_PLT32O 16 -#define R_68K_PLT16O 17 -#define R_68K_PLT8O 18 -#define R_68K_COPY 19 -#define R_68K_GLOB_DAT 20 -#define R_68K_JMP_SLOT 21 -#define R_68K_RELATIVE 22 - -typedef unsigned long elf_greg_t; - -#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t)) -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; - -typedef struct user_m68kfp_struct elf_fpregset_t; - -/* - * This is used to ensure we don't load something for the wrong architecture. - */ -#define elf_check_arch(x) ((x)->e_machine == EM_68K) - -/* - * These are used to set parameters in the core dumps. - */ -#define ELF_CLASS ELFCLASS32 -#define ELF_DATA ELFDATA2MSB -#define ELF_ARCH EM_68K - -/* For SVR4/m68k the function pointer to be registered with `atexit' is - passed in %a1. Although my copy of the ABI has no such statement, it - is actually used on ASV. */ -#define ELF_PLAT_INIT(_r, load_addr) _r->a1 = 0 - -#define USE_ELF_CORE_DUMP -#ifndef CONFIG_SUN3 -#define ELF_EXEC_PAGESIZE 4096 -#else -#define ELF_EXEC_PAGESIZE 8192 -#endif - -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#ifndef CONFIG_SUN3 -#define ELF_ET_DYN_BASE 0xD0000000UL -#else -#define ELF_ET_DYN_BASE 0x0D800000UL -#endif - -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - /* Bleech. */ \ - pr_reg[0] = regs->d1; \ - pr_reg[1] = regs->d2; \ - pr_reg[2] = regs->d3; \ - pr_reg[3] = regs->d4; \ - pr_reg[4] = regs->d5; \ - pr_reg[7] = regs->a0; \ - pr_reg[8] = regs->a1; \ - pr_reg[9] = regs->a2; \ - pr_reg[14] = regs->d0; \ - pr_reg[15] = rdusp(); \ - pr_reg[16] = regs->orig_d0; \ - pr_reg[17] = regs->sr; \ - pr_reg[18] = regs->pc; \ - pr_reg[19] = (regs->format << 12) | regs->vector; \ - { \ - struct switch_stack *sw = ((struct switch_stack *)regs) - 1; \ - pr_reg[5] = sw->d6; \ - pr_reg[6] = sw->d7; \ - pr_reg[10] = sw->a3; \ - pr_reg[11] = sw->a4; \ - pr_reg[12] = sw->a5; \ - pr_reg[13] = sw->a6; \ - } - -/* This yields a mask that user programs can use to figure out what - instruction set this cpu supports. */ - -#define ELF_HWCAP (0) - -/* This yields a string that ld.so will use to load implementation - specific libraries for optimization. This is more specific in - intent than poking at uname or /proc/cpuinfo. */ - -#define ELF_PLATFORM (NULL) - -#define SET_PERSONALITY(ex) set_personality(PER_LINUX) - -#endif diff --git a/arch/m68k/include/asm/elf_no.h b/arch/m68k/include/asm/elf_no.h deleted file mode 100644 index b8046837f384..000000000000 --- a/arch/m68k/include/asm/elf_no.h +++ /dev/null @@ -1,110 +0,0 @@ -#ifndef __ASMm68k_ELF_H -#define __ASMm68k_ELF_H - -/* - * ELF register definitions.. - */ - -#include -#include - -/* - * 68k ELF relocation types - */ -#define R_68K_NONE 0 -#define R_68K_32 1 -#define R_68K_16 2 -#define R_68K_8 3 -#define R_68K_PC32 4 -#define R_68K_PC16 5 -#define R_68K_PC8 6 -#define R_68K_GOT32 7 -#define R_68K_GOT16 8 -#define R_68K_GOT8 9 -#define R_68K_GOT32O 10 -#define R_68K_GOT16O 11 -#define R_68K_GOT8O 12 -#define R_68K_PLT32 13 -#define R_68K_PLT16 14 -#define R_68K_PLT8 15 -#define R_68K_PLT32O 16 -#define R_68K_PLT16O 17 -#define R_68K_PLT8O 18 -#define R_68K_COPY 19 -#define R_68K_GLOB_DAT 20 -#define R_68K_JMP_SLOT 21 -#define R_68K_RELATIVE 22 - -typedef unsigned long elf_greg_t; - -#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t)) -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; - -typedef struct user_m68kfp_struct elf_fpregset_t; - -/* - * This is used to ensure we don't load something for the wrong architecture. - */ -#define elf_check_arch(x) ((x)->e_machine == EM_68K) - -/* - * These are used to set parameters in the core dumps. - */ -#define ELF_CLASS ELFCLASS32 -#define ELF_DATA ELFDATA2MSB -#define ELF_ARCH EM_68K - -/* For SVR4/m68k the function pointer to be registered with `atexit' is - passed in %a1. Although my copy of the ABI has no such statement, it - is actually used on ASV. */ -#define ELF_PLAT_INIT(_r, load_addr) _r->a1 = 0 - -#define USE_ELF_CORE_DUMP -#define ELF_EXEC_PAGESIZE 4096 - -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#define ELF_ET_DYN_BASE 0xD0000000UL - -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - /* Bleech. */ \ - pr_reg[0] = regs->d1; \ - pr_reg[1] = regs->d2; \ - pr_reg[2] = regs->d3; \ - pr_reg[3] = regs->d4; \ - pr_reg[4] = regs->d5; \ - pr_reg[7] = regs->a0; \ - pr_reg[8] = regs->a1; \ - pr_reg[14] = regs->d0; \ - pr_reg[15] = rdusp(); \ - pr_reg[16] = 0 /* regs->orig_d0 */; \ - pr_reg[17] = regs->sr; \ - pr_reg[18] = regs->pc; \ - /* pr_reg[19] = (regs->format << 12) | regs->vector; */ \ - { \ - struct switch_stack *sw = ((struct switch_stack *)regs) - 1; \ - pr_reg[5] = sw->d6; \ - pr_reg[6] = sw->d7; \ - pr_reg[10] = sw->a3; \ - pr_reg[11] = sw->a4; \ - pr_reg[12] = sw->a5; \ - pr_reg[13] = sw->a6; \ - } - -/* This yields a mask that user programs can use to figure out what - instruction set this cpu supports. */ - -#define ELF_HWCAP (0) - -/* This yields a string that ld.so will use to load implementation - specific libraries for optimization. This is more specific in - intent than poking at uname or /proc/cpuinfo. */ - -#define ELF_PLATFORM (NULL) - -#define SET_PERSONALITY(ex) set_personality(PER_LINUX) - -#endif -- cgit v1.2.3-59-g8ed1b From af85fe9e5e618de205ac6944ed9fe45ade0874bf Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 16:50:22 +1000 Subject: m68k: use non-mmu version of unaligned.h for all m68k The non-mmu version is appropriately ifdef'ed to be used "as is" on all m68k varients. So switch to it as the only unaligned.h. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/unaligned.h | 26 +++++++++++++++++++++++--- arch/m68k/include/asm/unaligned_mm.h | 13 ------------- arch/m68k/include/asm/unaligned_no.h | 25 ------------------------- 3 files changed, 23 insertions(+), 41 deletions(-) delete mode 100644 arch/m68k/include/asm/unaligned_mm.h delete mode 100644 arch/m68k/include/asm/unaligned_no.h diff --git a/arch/m68k/include/asm/unaligned.h b/arch/m68k/include/asm/unaligned.h index c640bba3bdf4..eb1ea4cb9a59 100644 --- a/arch/m68k/include/asm/unaligned.h +++ b/arch/m68k/include/asm/unaligned.h @@ -1,5 +1,25 @@ -#ifdef __uClinux__ -#include "unaligned_no.h" +#ifndef _ASM_M68KNOMMU_UNALIGNED_H +#define _ASM_M68KNOMMU_UNALIGNED_H + + +#ifdef CONFIG_COLDFIRE +#include +#include +#include + +#define get_unaligned __get_unaligned_be +#define put_unaligned __put_unaligned_be + #else -#include "unaligned_mm.h" +/* + * The m68k can do unaligned accesses itself. + */ +#include +#include + +#define get_unaligned __get_unaligned_be +#define put_unaligned __put_unaligned_be + #endif + +#endif /* _ASM_M68KNOMMU_UNALIGNED_H */ diff --git a/arch/m68k/include/asm/unaligned_mm.h b/arch/m68k/include/asm/unaligned_mm.h deleted file mode 100644 index 77698f2dc33c..000000000000 --- a/arch/m68k/include/asm/unaligned_mm.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _ASM_M68K_UNALIGNED_H -#define _ASM_M68K_UNALIGNED_H - -/* - * The m68k can do unaligned accesses itself. - */ -#include -#include - -#define get_unaligned __get_unaligned_be -#define put_unaligned __put_unaligned_be - -#endif /* _ASM_M68K_UNALIGNED_H */ diff --git a/arch/m68k/include/asm/unaligned_no.h b/arch/m68k/include/asm/unaligned_no.h deleted file mode 100644 index eb1ea4cb9a59..000000000000 --- a/arch/m68k/include/asm/unaligned_no.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _ASM_M68KNOMMU_UNALIGNED_H -#define _ASM_M68KNOMMU_UNALIGNED_H - - -#ifdef CONFIG_COLDFIRE -#include -#include -#include - -#define get_unaligned __get_unaligned_be -#define put_unaligned __put_unaligned_be - -#else -/* - * The m68k can do unaligned accesses itself. - */ -#include -#include - -#define get_unaligned __get_unaligned_be -#define put_unaligned __put_unaligned_be - -#endif - -#endif /* _ASM_M68KNOMMU_UNALIGNED_H */ -- cgit v1.2.3-59-g8ed1b From e77d15511fc4e1554c500524dd43b9c162a2d95e Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 29 Jan 2009 17:09:51 +1000 Subject: m68k: the one hw_irq.h can be used buy all m68k The mmu and non-mmu hw_irq.h are identical, revert to a single copy. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/hw_irq.h | 9 +++++---- arch/m68k/include/asm/hw_irq_mm.h | 6 ------ arch/m68k/include/asm/hw_irq_no.h | 4 ---- 3 files changed, 5 insertions(+), 14 deletions(-) delete mode 100644 arch/m68k/include/asm/hw_irq_mm.h delete mode 100644 arch/m68k/include/asm/hw_irq_no.h diff --git a/arch/m68k/include/asm/hw_irq.h b/arch/m68k/include/asm/hw_irq.h index e19526015890..eacef0951fbf 100644 --- a/arch/m68k/include/asm/hw_irq.h +++ b/arch/m68k/include/asm/hw_irq.h @@ -1,5 +1,6 @@ -#ifdef __uClinux__ -#include "hw_irq_no.h" -#else -#include "hw_irq_mm.h" +#ifndef __ASM_M68K_HW_IRQ_H +#define __ASM_M68K_HW_IRQ_H + +/* Dummy include. */ + #endif diff --git a/arch/m68k/include/asm/hw_irq_mm.h b/arch/m68k/include/asm/hw_irq_mm.h deleted file mode 100644 index eacef0951fbf..000000000000 --- a/arch/m68k/include/asm/hw_irq_mm.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_M68K_HW_IRQ_H -#define __ASM_M68K_HW_IRQ_H - -/* Dummy include. */ - -#endif diff --git a/arch/m68k/include/asm/hw_irq_no.h b/arch/m68k/include/asm/hw_irq_no.h deleted file mode 100644 index f3ec9e5ae049..000000000000 --- a/arch/m68k/include/asm/hw_irq_no.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __M68KNOMMU_HW_IRQ_H__ -#define __M68KNOMMU_HW_IRQ_H__ - -#endif /* __M68KNOMMU_HW_IRQ_H__ */ -- cgit v1.2.3-59-g8ed1b From ac55cdfb02133df64d4aa5571b343d4e98673b07 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Mon, 2 Feb 2009 16:27:52 +1000 Subject: m68k: merge the mmu and non-mmu versions of page_offset.h Trivial merge of the mmu and non-mmu versions of page_offset.h Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/page_offset.h | 12 +++++++++--- arch/m68k/include/asm/page_offset_mm.h | 8 -------- arch/m68k/include/asm/page_offset_no.h | 5 ----- 3 files changed, 9 insertions(+), 16 deletions(-) delete mode 100644 arch/m68k/include/asm/page_offset_mm.h delete mode 100644 arch/m68k/include/asm/page_offset_no.h diff --git a/arch/m68k/include/asm/page_offset.h b/arch/m68k/include/asm/page_offset.h index 66455c849fbb..1780152d81da 100644 --- a/arch/m68k/include/asm/page_offset.h +++ b/arch/m68k/include/asm/page_offset.h @@ -1,5 +1,11 @@ -#ifdef __uClinux__ -#include "page_offset_no.h" +/* This handles the memory map.. */ + +#ifdef CONFIG_MMU +#ifndef CONFIG_SUN3 +#define PAGE_OFFSET_RAW 0x00000000 #else -#include "page_offset_mm.h" +#define PAGE_OFFSET_RAW 0x0E000000 +#endif +#else +#define PAGE_OFFSET_RAW CONFIG_RAMBASE #endif diff --git a/arch/m68k/include/asm/page_offset_mm.h b/arch/m68k/include/asm/page_offset_mm.h deleted file mode 100644 index 1cbdb7f30ac2..000000000000 --- a/arch/m68k/include/asm/page_offset_mm.h +++ /dev/null @@ -1,8 +0,0 @@ - -/* This handles the memory map.. */ -#ifndef CONFIG_SUN3 -#define PAGE_OFFSET_RAW 0x00000000 -#else -#define PAGE_OFFSET_RAW 0x0E000000 -#endif - diff --git a/arch/m68k/include/asm/page_offset_no.h b/arch/m68k/include/asm/page_offset_no.h deleted file mode 100644 index d4e73e0ba646..000000000000 --- a/arch/m68k/include/asm/page_offset_no.h +++ /dev/null @@ -1,5 +0,0 @@ - - -/* This handles the memory map.. */ -#define PAGE_OFFSET_RAW CONFIG_RAMBASE - -- cgit v1.2.3-59-g8ed1b From c8bd42707fc1d8e72e0cc722bdc0c0975b52d958 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 5 Feb 2009 09:58:44 +1000 Subject: m68knommu: mv definition of check_pgt_cache() Move the definition of check_pgt_cache() to be consistent with where m68k defines it. (Will make merging of these headers easier later on). Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/pgalloc_no.h | 2 -- arch/m68k/include/asm/pgtable_no.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/m68k/include/asm/pgalloc_no.h b/arch/m68k/include/asm/pgalloc_no.h index d6352f671ec0..a79ee0e163dc 100644 --- a/arch/m68k/include/asm/pgalloc_no.h +++ b/arch/m68k/include/asm/pgalloc_no.h @@ -3,6 +3,4 @@ #include -#define check_pgt_cache() do { } while (0) - #endif /* _M68KNOMMU_PGALLOC_H */ diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index 46251016e821..bf86b29fe64a 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -67,4 +67,6 @@ extern unsigned int kobjsize(const void *objp); #include +#define check_pgt_cache() do { } while (0) + #endif /* _M68KNOMMU_PGTABLE_H */ -- cgit v1.2.3-59-g8ed1b From 606333d64e06db9ae14b5a025380bc5954b1a6c4 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 5 Feb 2009 10:01:00 +1000 Subject: m68k: merge the mmu and non-mmu versions of pgalloc.h Trivial merge of the mmu and non-mmu version of pgalloc.h Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/pgalloc.h | 20 +++++++++++++++++--- arch/m68k/include/asm/pgalloc_mm.h | 19 ------------------- arch/m68k/include/asm/pgalloc_no.h | 6 ------ 3 files changed, 17 insertions(+), 28 deletions(-) delete mode 100644 arch/m68k/include/asm/pgalloc_mm.h delete mode 100644 arch/m68k/include/asm/pgalloc_no.h diff --git a/arch/m68k/include/asm/pgalloc.h b/arch/m68k/include/asm/pgalloc.h index 059cb73e78fc..c294aad8a900 100644 --- a/arch/m68k/include/asm/pgalloc.h +++ b/arch/m68k/include/asm/pgalloc.h @@ -1,5 +1,19 @@ -#ifdef __uClinux__ -#include "pgalloc_no.h" +#ifndef M68K_PGALLOC_H +#define M68K_PGALLOC_H + +#include +#include +#include + +#ifdef CONFIG_MMU +#include +#ifdef CONFIG_SUN3 +#include #else -#include "pgalloc_mm.h" +#include #endif + +extern void m68k_setup_node(int node); +#endif + +#endif /* M68K_PGALLOC_H */ diff --git a/arch/m68k/include/asm/pgalloc_mm.h b/arch/m68k/include/asm/pgalloc_mm.h deleted file mode 100644 index 4cb1a57ab763..000000000000 --- a/arch/m68k/include/asm/pgalloc_mm.h +++ /dev/null @@ -1,19 +0,0 @@ - -#ifndef M68K_PGALLOC_H -#define M68K_PGALLOC_H - -#include -#include -#include -#include - - -#ifdef CONFIG_SUN3 -#include -#else -#include -#endif - -extern void m68k_setup_node(int node); - -#endif /* M68K_PGALLOC_H */ diff --git a/arch/m68k/include/asm/pgalloc_no.h b/arch/m68k/include/asm/pgalloc_no.h deleted file mode 100644 index a79ee0e163dc..000000000000 --- a/arch/m68k/include/asm/pgalloc_no.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _M68KNOMMU_PGALLOC_H -#define _M68KNOMMU_PGALLOC_H - -#include - -#endif /* _M68KNOMMU_PGALLOC_H */ -- cgit v1.2.3-59-g8ed1b From 15ee04c288d8d612d93a14decc968e817d21e789 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 5 Feb 2009 10:05:25 +1000 Subject: m68k: Use dma_addr_t for scatterlist.dma_address dma_addr_t (as was used by m68knommu) is more correct than u32. Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Ungerer --- arch/m68k/include/asm/scatterlist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68k/include/asm/scatterlist.h b/arch/m68k/include/asm/scatterlist.h index d3a7a0edfeca..e27ad902b1cf 100644 --- a/arch/m68k/include/asm/scatterlist.h +++ b/arch/m68k/include/asm/scatterlist.h @@ -11,7 +11,7 @@ struct scatterlist { unsigned int offset; unsigned int length; - __u32 dma_address; /* A place to hang host-specific addresses at. */ + dma_addr_t dma_address; /* A place to hang host-specific addresses at. */ }; /* This is bogus and should go away. */ -- cgit v1.2.3-59-g8ed1b From c315bd5fdd2a949a9ccc22b2c64ee694602682d4 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 5 Feb 2009 10:07:46 +1000 Subject: m68k: Restore correct include guards for Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Ungerer --- arch/m68k/include/asm/unaligned.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/m68k/include/asm/unaligned.h b/arch/m68k/include/asm/unaligned.h index eb1ea4cb9a59..019caa740c21 100644 --- a/arch/m68k/include/asm/unaligned.h +++ b/arch/m68k/include/asm/unaligned.h @@ -1,5 +1,5 @@ -#ifndef _ASM_M68KNOMMU_UNALIGNED_H -#define _ASM_M68KNOMMU_UNALIGNED_H +#ifndef _ASM_M68K_UNALIGNED_H +#define _ASM_M68K_UNALIGNED_H #ifdef CONFIG_COLDFIRE @@ -22,4 +22,4 @@ #endif -#endif /* _ASM_M68KNOMMU_UNALIGNED_H */ +#endif /* _ASM_M68K_UNALIGNED_H */ -- cgit v1.2.3-59-g8ed1b From 4330e179a96bc9310d36e9b858bc8f275f329312 Mon Sep 17 00:00:00 2001 From: Len Sorensen Date: Thu, 5 Feb 2009 10:11:24 +1000 Subject: m68knommu: Fix support for console port other than ttyS0 on mcf.c Due to a case of backwards logic, mfc.c always makes the console port be ttyS0 even when you ask for another port. This patch fixes this issue. Only when the requested port is NOT in the range 0 to MAXPORTS-1 do we force it to be treated as if port 0 was requested. Forcing the port to 0 when it is in fact in the range 0 to MAXPORTS is not helpful. Tested with working console on ttyS2 on a 5271evb. Signed-off-by: Len Sorensen Signed-off-by: Greg Ungerer --- drivers/serial/mcf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/serial/mcf.c b/drivers/serial/mcf.c index 56841fe5f483..0eefb07bebaf 100644 --- a/drivers/serial/mcf.c +++ b/drivers/serial/mcf.c @@ -513,7 +513,7 @@ static int __init mcf_console_setup(struct console *co, char *options) int parity = 'n'; int flow = 'n'; - if ((co->index >= 0) && (co->index <= MCF_MAXPORTS)) + if ((co->index < 0) || (co->index >= MCF_MAXPORTS)) co->index = 0; port = &mcf_ports[co->index].port; if (port->membase == 0) -- cgit v1.2.3-59-g8ed1b From 872065e78c0dff5a384128ea198b34f72e9d5c3a Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Fri, 6 Feb 2009 13:55:16 +1000 Subject: m68k: merge the mmu and non-mmu versions of ucontext.h The non-mmu m68k setups can use the mm ucontext.h with no change. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/ucontext.h | 33 +++++++++++++++++++++++++++++---- arch/m68k/include/asm/ucontext_mm.h | 30 ------------------------------ arch/m68k/include/asm/ucontext_no.h | 32 -------------------------------- 3 files changed, 29 insertions(+), 66 deletions(-) delete mode 100644 arch/m68k/include/asm/ucontext_mm.h delete mode 100644 arch/m68k/include/asm/ucontext_no.h diff --git a/arch/m68k/include/asm/ucontext.h b/arch/m68k/include/asm/ucontext.h index b53cd160c0b3..e4e22669edc0 100644 --- a/arch/m68k/include/asm/ucontext.h +++ b/arch/m68k/include/asm/ucontext.h @@ -1,5 +1,30 @@ -#ifdef __uClinux__ -#include "ucontext_no.h" -#else -#include "ucontext_mm.h" +#ifndef _M68K_UCONTEXT_H +#define _M68K_UCONTEXT_H + +typedef int greg_t; +#define NGREG 18 +typedef greg_t gregset_t[NGREG]; + +typedef struct fpregset { + int f_fpcntl[3]; + int f_fpregs[8*3]; +} fpregset_t; + +struct mcontext { + int version; + gregset_t gregs; + fpregset_t fpregs; +}; + +#define MCONTEXT_VERSION 2 + +struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + struct mcontext uc_mcontext; + unsigned long uc_filler[80]; + sigset_t uc_sigmask; /* mask last for extensibility */ +}; + #endif diff --git a/arch/m68k/include/asm/ucontext_mm.h b/arch/m68k/include/asm/ucontext_mm.h deleted file mode 100644 index e4e22669edc0..000000000000 --- a/arch/m68k/include/asm/ucontext_mm.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _M68K_UCONTEXT_H -#define _M68K_UCONTEXT_H - -typedef int greg_t; -#define NGREG 18 -typedef greg_t gregset_t[NGREG]; - -typedef struct fpregset { - int f_fpcntl[3]; - int f_fpregs[8*3]; -} fpregset_t; - -struct mcontext { - int version; - gregset_t gregs; - fpregset_t fpregs; -}; - -#define MCONTEXT_VERSION 2 - -struct ucontext { - unsigned long uc_flags; - struct ucontext *uc_link; - stack_t uc_stack; - struct mcontext uc_mcontext; - unsigned long uc_filler[80]; - sigset_t uc_sigmask; /* mask last for extensibility */ -}; - -#endif diff --git a/arch/m68k/include/asm/ucontext_no.h b/arch/m68k/include/asm/ucontext_no.h deleted file mode 100644 index 713a27f901cd..000000000000 --- a/arch/m68k/include/asm/ucontext_no.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _M68KNOMMU_UCONTEXT_H -#define _M68KNOMMU_UCONTEXT_H - -typedef int greg_t; -#define NGREG 18 -typedef greg_t gregset_t[NGREG]; - -typedef struct fpregset { - int f_pcr; - int f_psr; - int f_fpiaddr; - int f_fpregs[8][3]; -} fpregset_t; - -struct mcontext { - int version; - gregset_t gregs; - fpregset_t fpregs; -}; - -#define MCONTEXT_VERSION 2 - -struct ucontext { - unsigned long uc_flags; - struct ucontext *uc_link; - stack_t uc_stack; - struct mcontext uc_mcontext; - unsigned long uc_filler[80]; - sigset_t uc_sigmask; /* mask last for extensibility */ -}; - -#endif -- cgit v1.2.3-59-g8ed1b From 4892242a47fe21724ee6a98d71d9c512716ea89d Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Fri, 6 Feb 2009 14:14:29 +1000 Subject: m68k: merge the mmu and non-mmu versions of segment.h Trivial merge of the mmu and non-mmu version of segment.h. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/segment.h | 64 ++++++++++++++++++++++++++++++++++++-- arch/m68k/include/asm/segment_mm.h | 57 --------------------------------- arch/m68k/include/asm/segment_no.h | 51 ------------------------------ 3 files changed, 61 insertions(+), 111 deletions(-) delete mode 100644 arch/m68k/include/asm/segment_mm.h delete mode 100644 arch/m68k/include/asm/segment_no.h diff --git a/arch/m68k/include/asm/segment.h b/arch/m68k/include/asm/segment.h index 82583bc004bd..ee959219fdfe 100644 --- a/arch/m68k/include/asm/segment.h +++ b/arch/m68k/include/asm/segment.h @@ -1,5 +1,63 @@ -#ifdef __uClinux__ -#include "segment_no.h" +#ifndef _M68K_SEGMENT_H +#define _M68K_SEGMENT_H + +/* define constants */ +/* Address spaces (FC0-FC2) */ +#define USER_DATA (1) +#ifndef __USER_DS +#define __USER_DS (USER_DATA) +#endif +#define USER_PROGRAM (2) +#define SUPER_DATA (5) +#ifndef __KERNEL_DS +#define __KERNEL_DS (SUPER_DATA) +#endif +#define SUPER_PROGRAM (6) +#define CPU_SPACE (7) + +#ifndef __ASSEMBLY__ + +typedef struct { + unsigned long seg; +} mm_segment_t; + +#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) +#define USER_DS MAKE_MM_SEG(__USER_DS) +#define KERNEL_DS MAKE_MM_SEG(__KERNEL_DS) + +/* + * Get/set the SFC/DFC registers for MOVES instructions + */ + +static inline mm_segment_t get_fs(void) +{ +#ifdef CONFIG_MMU + mm_segment_t _v; + __asm__ ("movec %/dfc,%0":"=r" (_v.seg):); + + return _v; #else -#include "segment_mm.h" + return USER_DS; +#endif +} + +static inline mm_segment_t get_ds(void) +{ + /* return the supervisor data space code */ + return KERNEL_DS; +} + +static inline void set_fs(mm_segment_t val) +{ +#ifdef CONFIG_MMU + __asm__ __volatile__ ("movec %0,%/sfc\n\t" + "movec %0,%/dfc\n\t" + : /* no outputs */ : "r" (val.seg) : "memory"); #endif +} + +#define segment_eq(a,b) ((a).seg == (b).seg) + +#endif /* __ASSEMBLY__ */ + +#endif /* _M68K_SEGMENT_H */ diff --git a/arch/m68k/include/asm/segment_mm.h b/arch/m68k/include/asm/segment_mm.h deleted file mode 100644 index 7b0b2d3127f9..000000000000 --- a/arch/m68k/include/asm/segment_mm.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef _M68K_SEGMENT_H -#define _M68K_SEGMENT_H - -/* define constants */ -/* Address spaces (FC0-FC2) */ -#define USER_DATA (1) -#ifndef __USER_DS -#define __USER_DS (USER_DATA) -#endif -#define USER_PROGRAM (2) -#define SUPER_DATA (5) -#ifndef __KERNEL_DS -#define __KERNEL_DS (SUPER_DATA) -#endif -#define SUPER_PROGRAM (6) -#define CPU_SPACE (7) - -#ifndef __ASSEMBLY__ - -typedef struct { - unsigned long seg; -} mm_segment_t; - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) -#define USER_DS MAKE_MM_SEG(__USER_DS) -#define KERNEL_DS MAKE_MM_SEG(__KERNEL_DS) - -/* - * Get/set the SFC/DFC registers for MOVES instructions - */ - -static inline mm_segment_t get_fs(void) -{ - mm_segment_t _v; - __asm__ ("movec %/dfc,%0":"=r" (_v.seg):); - - return _v; -} - -static inline mm_segment_t get_ds(void) -{ - /* return the supervisor data space code */ - return KERNEL_DS; -} - -static inline void set_fs(mm_segment_t val) -{ - __asm__ __volatile__ ("movec %0,%/sfc\n\t" - "movec %0,%/dfc\n\t" - : /* no outputs */ : "r" (val.seg) : "memory"); -} - -#define segment_eq(a,b) ((a).seg == (b).seg) - -#endif /* __ASSEMBLY__ */ - -#endif /* _M68K_SEGMENT_H */ diff --git a/arch/m68k/include/asm/segment_no.h b/arch/m68k/include/asm/segment_no.h deleted file mode 100644 index 42318ebec7ec..000000000000 --- a/arch/m68k/include/asm/segment_no.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef _M68K_SEGMENT_H -#define _M68K_SEGMENT_H - -/* define constants */ -/* Address spaces (FC0-FC2) */ -#define USER_DATA (1) -#ifndef __USER_DS -#define __USER_DS (USER_DATA) -#endif -#define USER_PROGRAM (2) -#define SUPER_DATA (5) -#ifndef __KERNEL_DS -#define __KERNEL_DS (SUPER_DATA) -#endif -#define SUPER_PROGRAM (6) -#define CPU_SPACE (7) - -#ifndef __ASSEMBLY__ - -typedef struct { - unsigned long seg; -} mm_segment_t; - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) -#define USER_DS MAKE_MM_SEG(__USER_DS) -#define KERNEL_DS MAKE_MM_SEG(__KERNEL_DS) - -/* - * Get/set the SFC/DFC registers for MOVES instructions - */ - -static inline mm_segment_t get_fs(void) -{ - return USER_DS; -} - -static inline mm_segment_t get_ds(void) -{ - /* return the supervisor data space code */ - return KERNEL_DS; -} - -static inline void set_fs(mm_segment_t val) -{ -} - -#define segment_eq(a,b) ((a).seg == (b).seg) - -#endif /* __ASSEMBLY__ */ - -#endif /* _M68K_SEGMENT_H */ -- cgit v1.2.3-59-g8ed1b From 146b7cdba2c7afb14d965a5dc466a6532084042a Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Fri, 6 Feb 2009 14:25:53 +1000 Subject: m68k: merge the mmu and non-mmu versions of fb.h Trivial merge of the mmu and non-mmu versions of fb.h Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/fb.h | 41 +++++++++++++++++++++++++++++++++++++---- arch/m68k/include/asm/fb_mm.h | 34 ---------------------------------- arch/m68k/include/asm/fb_no.h | 12 ------------ 3 files changed, 37 insertions(+), 50 deletions(-) delete mode 100644 arch/m68k/include/asm/fb_mm.h delete mode 100644 arch/m68k/include/asm/fb_no.h diff --git a/arch/m68k/include/asm/fb.h b/arch/m68k/include/asm/fb.h index 97bcaefd2064..be4e4c6797e8 100644 --- a/arch/m68k/include/asm/fb.h +++ b/arch/m68k/include/asm/fb.h @@ -1,5 +1,38 @@ -#ifdef __uClinux__ -#include "fb_no.h" +#ifndef _ASM_FB_H_ +#define _ASM_FB_H_ + +#include +#include +#include +#include + +#ifdef CONFIG_MMU +#ifdef CONFIG_SUN3 +static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, + unsigned long off) +{ + pgprot_val(vma->vm_page_prot) |= SUN3_PAGE_NOCACHE; +} #else -#include "fb_mm.h" -#endif +static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, + unsigned long off) +{ + if (CPU_IS_020_OR_030) + pgprot_val(vma->vm_page_prot) |= _PAGE_NOCACHE030; + if (CPU_IS_040_OR_060) { + pgprot_val(vma->vm_page_prot) &= _CACHEMASK040; + /* Use no-cache mode, serialized */ + pgprot_val(vma->vm_page_prot) |= _PAGE_NOCACHE_S; + } +} +#endif /* CONFIG_SUN3 */ +#else +#define fb_pgprotect(...) do {} while (0) +#endif /* CONFIG_MMU */ + +static inline int fb_is_primary_device(struct fb_info *info) +{ + return 0; +} + +#endif /* _ASM_FB_H_ */ diff --git a/arch/m68k/include/asm/fb_mm.h b/arch/m68k/include/asm/fb_mm.h deleted file mode 100644 index 380b97ae8157..000000000000 --- a/arch/m68k/include/asm/fb_mm.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _ASM_FB_H_ -#define _ASM_FB_H_ - -#include -#include -#include -#include - -#ifdef CONFIG_SUN3 -static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, - unsigned long off) -{ - pgprot_val(vma->vm_page_prot) |= SUN3_PAGE_NOCACHE; -} -#else -static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, - unsigned long off) -{ - if (CPU_IS_020_OR_030) - pgprot_val(vma->vm_page_prot) |= _PAGE_NOCACHE030; - if (CPU_IS_040_OR_060) { - pgprot_val(vma->vm_page_prot) &= _CACHEMASK040; - /* Use no-cache mode, serialized */ - pgprot_val(vma->vm_page_prot) |= _PAGE_NOCACHE_S; - } -} -#endif /* CONFIG_SUN3 */ - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H_ */ diff --git a/arch/m68k/include/asm/fb_no.h b/arch/m68k/include/asm/fb_no.h deleted file mode 100644 index c7df38030992..000000000000 --- a/arch/m68k/include/asm/fb_no.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASM_FB_H_ -#define _ASM_FB_H_ -#include - -#define fb_pgprotect(...) do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H_ */ -- cgit v1.2.3-59-g8ed1b From de9f4fc2b7d73057cefa134901d8ee8e3f593f99 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Fri, 6 Feb 2009 14:31:51 +1000 Subject: m68k: use the mmu version of bootinfo.h for m68knommu as well All m68k varients can use the same mmu version of bootinfo.h Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/bootinfo.h | 381 +++++++++++++++++++++++++++++++++++- arch/m68k/include/asm/bootinfo_mm.h | 378 ----------------------------------- arch/m68k/include/asm/bootinfo_no.h | 2 - 3 files changed, 377 insertions(+), 384 deletions(-) delete mode 100644 arch/m68k/include/asm/bootinfo_mm.h delete mode 100644 arch/m68k/include/asm/bootinfo_no.h diff --git a/arch/m68k/include/asm/bootinfo.h b/arch/m68k/include/asm/bootinfo.h index fedf3e326121..fb8a06b9ab6a 100644 --- a/arch/m68k/include/asm/bootinfo.h +++ b/arch/m68k/include/asm/bootinfo.h @@ -1,5 +1,378 @@ -#ifdef __uClinux__ -#include "bootinfo_no.h" -#else -#include "bootinfo_mm.h" +/* +** asm/bootinfo.h -- Definition of the Linux/m68k boot information structure +** +** Copyright 1992 by Greg Harp +** +** This file is subject to the terms and conditions of the GNU General Public +** License. See the file COPYING in the main directory of this archive +** for more details. +** +** Created 09/29/92 by Greg Harp +** +** 5/2/94 Roman Hodek: +** Added bi_atari part of the machine dependent union bi_un; for now it +** contains just a model field to distinguish between TT and Falcon. +** 26/7/96 Roman Zippel: +** Renamed to setup.h; added some useful macros to allow gcc some +** optimizations if possible. +** 5/10/96 Geert Uytterhoeven: +** Redesign of the boot information structure; renamed to bootinfo.h again +** 27/11/96 Geert Uytterhoeven: +** Backwards compatibility with bootinfo interface version 1.0 +*/ + +#ifndef _M68K_BOOTINFO_H +#define _M68K_BOOTINFO_H + + + /* + * Bootinfo definitions + * + * This is an easily parsable and extendable structure containing all + * information to be passed from the bootstrap to the kernel. + * + * This way I hope to keep all future changes back/forewards compatible. + * Thus, keep your fingers crossed... + * + * This structure is copied right after the kernel bss by the bootstrap + * routine. + */ + +#ifndef __ASSEMBLY__ + +struct bi_record { + unsigned short tag; /* tag ID */ + unsigned short size; /* size of record (in bytes) */ + unsigned long data[0]; /* data */ +}; + +#endif /* __ASSEMBLY__ */ + + + /* + * Tag Definitions + * + * Machine independent tags start counting from 0x0000 + * Machine dependent tags start counting from 0x8000 + */ + +#define BI_LAST 0x0000 /* last record (sentinel) */ +#define BI_MACHTYPE 0x0001 /* machine type (u_long) */ +#define BI_CPUTYPE 0x0002 /* cpu type (u_long) */ +#define BI_FPUTYPE 0x0003 /* fpu type (u_long) */ +#define BI_MMUTYPE 0x0004 /* mmu type (u_long) */ +#define BI_MEMCHUNK 0x0005 /* memory chunk address and size */ + /* (struct mem_info) */ +#define BI_RAMDISK 0x0006 /* ramdisk address and size */ + /* (struct mem_info) */ +#define BI_COMMAND_LINE 0x0007 /* kernel command line parameters */ + /* (string) */ + + /* + * Amiga-specific tags + */ + +#define BI_AMIGA_MODEL 0x8000 /* model (u_long) */ +#define BI_AMIGA_AUTOCON 0x8001 /* AutoConfig device */ + /* (struct ConfigDev) */ +#define BI_AMIGA_CHIP_SIZE 0x8002 /* size of Chip RAM (u_long) */ +#define BI_AMIGA_VBLANK 0x8003 /* VBLANK frequency (u_char) */ +#define BI_AMIGA_PSFREQ 0x8004 /* power supply frequency (u_char) */ +#define BI_AMIGA_ECLOCK 0x8005 /* EClock frequency (u_long) */ +#define BI_AMIGA_CHIPSET 0x8006 /* native chipset present (u_long) */ +#define BI_AMIGA_SERPER 0x8007 /* serial port period (u_short) */ + + /* + * Atari-specific tags + */ + +#define BI_ATARI_MCH_COOKIE 0x8000 /* _MCH cookie from TOS (u_long) */ +#define BI_ATARI_MCH_TYPE 0x8001 /* special machine type (u_long) */ + /* (values are ATARI_MACH_* defines */ + +/* mch_cookie values (upper word) */ +#define ATARI_MCH_ST 0 +#define ATARI_MCH_STE 1 +#define ATARI_MCH_TT 2 +#define ATARI_MCH_FALCON 3 + +/* mch_type values */ +#define ATARI_MACH_NORMAL 0 /* no special machine type */ +#define ATARI_MACH_MEDUSA 1 /* Medusa 040 */ +#define ATARI_MACH_HADES 2 /* Hades 040 or 060 */ +#define ATARI_MACH_AB40 3 /* Afterburner040 on Falcon */ + + /* + * VME-specific tags + */ + +#define BI_VME_TYPE 0x8000 /* VME sub-architecture (u_long) */ +#define BI_VME_BRDINFO 0x8001 /* VME board information (struct) */ + +/* BI_VME_TYPE codes */ +#define VME_TYPE_TP34V 0x0034 /* Tadpole TP34V */ +#define VME_TYPE_MVME147 0x0147 /* Motorola MVME147 */ +#define VME_TYPE_MVME162 0x0162 /* Motorola MVME162 */ +#define VME_TYPE_MVME166 0x0166 /* Motorola MVME166 */ +#define VME_TYPE_MVME167 0x0167 /* Motorola MVME167 */ +#define VME_TYPE_MVME172 0x0172 /* Motorola MVME172 */ +#define VME_TYPE_MVME177 0x0177 /* Motorola MVME177 */ +#define VME_TYPE_BVME4000 0x4000 /* BVM Ltd. BVME4000 */ +#define VME_TYPE_BVME6000 0x6000 /* BVM Ltd. BVME6000 */ + +/* BI_VME_BRDINFO is a 32 byte struct as returned by the Bug code on + * Motorola VME boards. Contains board number, Bug version, board + * configuration options, etc. See include/asm/mvme16xhw.h for details. + */ + + + /* + * Macintosh-specific tags (all u_long) + */ + +#define BI_MAC_MODEL 0x8000 /* Mac Gestalt ID (model type) */ +#define BI_MAC_VADDR 0x8001 /* Mac video base address */ +#define BI_MAC_VDEPTH 0x8002 /* Mac video depth */ +#define BI_MAC_VROW 0x8003 /* Mac video rowbytes */ +#define BI_MAC_VDIM 0x8004 /* Mac video dimensions */ +#define BI_MAC_VLOGICAL 0x8005 /* Mac video logical base */ +#define BI_MAC_SCCBASE 0x8006 /* Mac SCC base address */ +#define BI_MAC_BTIME 0x8007 /* Mac boot time */ +#define BI_MAC_GMTBIAS 0x8008 /* Mac GMT timezone offset */ +#define BI_MAC_MEMSIZE 0x8009 /* Mac RAM size (sanity check) */ +#define BI_MAC_CPUID 0x800a /* Mac CPU type (sanity check) */ +#define BI_MAC_ROMBASE 0x800b /* Mac system ROM base address */ + + /* + * Macintosh hardware profile data - unused, see macintosh.h for + * resonable type values + */ + +#define BI_MAC_VIA1BASE 0x8010 /* Mac VIA1 base address (always present) */ +#define BI_MAC_VIA2BASE 0x8011 /* Mac VIA2 base address (type varies) */ +#define BI_MAC_VIA2TYPE 0x8012 /* Mac VIA2 type (VIA, RBV, OSS) */ +#define BI_MAC_ADBTYPE 0x8013 /* Mac ADB interface type */ +#define BI_MAC_ASCBASE 0x8014 /* Mac Apple Sound Chip base address */ +#define BI_MAC_SCSI5380 0x8015 /* Mac NCR 5380 SCSI (base address, multi) */ +#define BI_MAC_SCSIDMA 0x8016 /* Mac SCSI DMA (base address) */ +#define BI_MAC_SCSI5396 0x8017 /* Mac NCR 53C96 SCSI (base address, multi) */ +#define BI_MAC_IDETYPE 0x8018 /* Mac IDE interface type */ +#define BI_MAC_IDEBASE 0x8019 /* Mac IDE interface base address */ +#define BI_MAC_NUBUS 0x801a /* Mac Nubus type (none, regular, pseudo) */ +#define BI_MAC_SLOTMASK 0x801b /* Mac Nubus slots present */ +#define BI_MAC_SCCTYPE 0x801c /* Mac SCC serial type (normal, IOP) */ +#define BI_MAC_ETHTYPE 0x801d /* Mac builtin ethernet type (Sonic, MACE */ +#define BI_MAC_ETHBASE 0x801e /* Mac builtin ethernet base address */ +#define BI_MAC_PMU 0x801f /* Mac power management / poweroff hardware */ +#define BI_MAC_IOP_SWIM 0x8020 /* Mac SWIM floppy IOP */ +#define BI_MAC_IOP_ADB 0x8021 /* Mac ADB IOP */ + + /* + * Mac: compatibility with old booter data format (temporarily) + * Fields unused with the new bootinfo can be deleted now; instead of + * adding new fields the struct might be splitted into a hardware address + * part and a hardware type part + */ + +#ifndef __ASSEMBLY__ + +struct mac_booter_data +{ + unsigned long videoaddr; + unsigned long videorow; + unsigned long videodepth; + unsigned long dimensions; + unsigned long args; + unsigned long boottime; + unsigned long gmtbias; + unsigned long bootver; + unsigned long videological; + unsigned long sccbase; + unsigned long id; + unsigned long memsize; + unsigned long serialmf; + unsigned long serialhsk; + unsigned long serialgpi; + unsigned long printmf; + unsigned long printhsk; + unsigned long printgpi; + unsigned long cpuid; + unsigned long rombase; + unsigned long adbdelay; + unsigned long timedbra; +}; + +extern struct mac_booter_data + mac_bi_data; + #endif + + /* + * Apollo-specific tags + */ + +#define BI_APOLLO_MODEL 0x8000 /* model (u_long) */ + + /* + * HP300-specific tags + */ + +#define BI_HP300_MODEL 0x8000 /* model (u_long) */ +#define BI_HP300_UART_SCODE 0x8001 /* UART select code (u_long) */ +#define BI_HP300_UART_ADDR 0x8002 /* phys. addr of UART (u_long) */ + + /* + * Stuff for bootinfo interface versioning + * + * At the start of kernel code, a 'struct bootversion' is located. + * bootstrap checks for a matching version of the interface before booting + * a kernel, to avoid user confusion if kernel and bootstrap don't work + * together :-) + * + * If incompatible changes are made to the bootinfo interface, the major + * number below should be stepped (and the minor reset to 0) for the + * appropriate machine. If a change is backward-compatible, the minor + * should be stepped. "Backwards-compatible" means that booting will work, + * but certain features may not. + */ + +#define BOOTINFOV_MAGIC 0x4249561A /* 'BIV^Z' */ +#define MK_BI_VERSION(major,minor) (((major)<<16)+(minor)) +#define BI_VERSION_MAJOR(v) (((v) >> 16) & 0xffff) +#define BI_VERSION_MINOR(v) ((v) & 0xffff) + +#ifndef __ASSEMBLY__ + +struct bootversion { + unsigned short branch; + unsigned long magic; + struct { + unsigned long machtype; + unsigned long version; + } machversions[0]; +}; + +#endif /* __ASSEMBLY__ */ + +#define AMIGA_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) +#define ATARI_BOOTI_VERSION MK_BI_VERSION( 2, 1 ) +#define MAC_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) +#define MVME147_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) +#define MVME16x_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) +#define BVME6000_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) +#define Q40_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) +#define HP300_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) + +#ifdef BOOTINFO_COMPAT_1_0 + + /* + * Backwards compatibility with bootinfo interface version 1.0 + */ + +#define COMPAT_AMIGA_BOOTI_VERSION MK_BI_VERSION( 1, 0 ) +#define COMPAT_ATARI_BOOTI_VERSION MK_BI_VERSION( 1, 0 ) +#define COMPAT_MAC_BOOTI_VERSION MK_BI_VERSION( 1, 0 ) + +#include + +#define COMPAT_NUM_AUTO 16 + +struct compat_bi_Amiga { + int model; + int num_autocon; + struct ConfigDev autocon[COMPAT_NUM_AUTO]; + unsigned long chip_size; + unsigned char vblank; + unsigned char psfreq; + unsigned long eclock; + unsigned long chipset; + unsigned long hw_present; +}; + +struct compat_bi_Atari { + unsigned long hw_present; + unsigned long mch_cookie; +}; + +#ifndef __ASSEMBLY__ + +struct compat_bi_Macintosh +{ + unsigned long videoaddr; + unsigned long videorow; + unsigned long videodepth; + unsigned long dimensions; + unsigned long args; + unsigned long boottime; + unsigned long gmtbias; + unsigned long bootver; + unsigned long videological; + unsigned long sccbase; + unsigned long id; + unsigned long memsize; + unsigned long serialmf; + unsigned long serialhsk; + unsigned long serialgpi; + unsigned long printmf; + unsigned long printhsk; + unsigned long printgpi; + unsigned long cpuid; + unsigned long rombase; + unsigned long adbdelay; + unsigned long timedbra; +}; + +#endif + +struct compat_mem_info { + unsigned long addr; + unsigned long size; +}; + +#define COMPAT_NUM_MEMINFO 4 + +#define COMPAT_CPUB_68020 0 +#define COMPAT_CPUB_68030 1 +#define COMPAT_CPUB_68040 2 +#define COMPAT_CPUB_68060 3 +#define COMPAT_FPUB_68881 5 +#define COMPAT_FPUB_68882 6 +#define COMPAT_FPUB_68040 7 +#define COMPAT_FPUB_68060 8 + +#define COMPAT_CPU_68020 (1<> 16) & 0xffff) -#define BI_VERSION_MINOR(v) ((v) & 0xffff) - -#ifndef __ASSEMBLY__ - -struct bootversion { - unsigned short branch; - unsigned long magic; - struct { - unsigned long machtype; - unsigned long version; - } machversions[0]; -}; - -#endif /* __ASSEMBLY__ */ - -#define AMIGA_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) -#define ATARI_BOOTI_VERSION MK_BI_VERSION( 2, 1 ) -#define MAC_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) -#define MVME147_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) -#define MVME16x_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) -#define BVME6000_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) -#define Q40_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) -#define HP300_BOOTI_VERSION MK_BI_VERSION( 2, 0 ) - -#ifdef BOOTINFO_COMPAT_1_0 - - /* - * Backwards compatibility with bootinfo interface version 1.0 - */ - -#define COMPAT_AMIGA_BOOTI_VERSION MK_BI_VERSION( 1, 0 ) -#define COMPAT_ATARI_BOOTI_VERSION MK_BI_VERSION( 1, 0 ) -#define COMPAT_MAC_BOOTI_VERSION MK_BI_VERSION( 1, 0 ) - -#include - -#define COMPAT_NUM_AUTO 16 - -struct compat_bi_Amiga { - int model; - int num_autocon; - struct ConfigDev autocon[COMPAT_NUM_AUTO]; - unsigned long chip_size; - unsigned char vblank; - unsigned char psfreq; - unsigned long eclock; - unsigned long chipset; - unsigned long hw_present; -}; - -struct compat_bi_Atari { - unsigned long hw_present; - unsigned long mch_cookie; -}; - -#ifndef __ASSEMBLY__ - -struct compat_bi_Macintosh -{ - unsigned long videoaddr; - unsigned long videorow; - unsigned long videodepth; - unsigned long dimensions; - unsigned long args; - unsigned long boottime; - unsigned long gmtbias; - unsigned long bootver; - unsigned long videological; - unsigned long sccbase; - unsigned long id; - unsigned long memsize; - unsigned long serialmf; - unsigned long serialhsk; - unsigned long serialgpi; - unsigned long printmf; - unsigned long printhsk; - unsigned long printgpi; - unsigned long cpuid; - unsigned long rombase; - unsigned long adbdelay; - unsigned long timedbra; -}; - -#endif - -struct compat_mem_info { - unsigned long addr; - unsigned long size; -}; - -#define COMPAT_NUM_MEMINFO 4 - -#define COMPAT_CPUB_68020 0 -#define COMPAT_CPUB_68030 1 -#define COMPAT_CPUB_68040 2 -#define COMPAT_CPUB_68060 3 -#define COMPAT_FPUB_68881 5 -#define COMPAT_FPUB_68882 6 -#define COMPAT_FPUB_68040 7 -#define COMPAT_FPUB_68060 8 - -#define COMPAT_CPU_68020 (1< Date: Fri, 6 Feb 2009 14:34:47 +1000 Subject: m68k: use the mmu version of cache.h for m68knommu as well The non-mmu version of cache.h is almost the same as the mmu version. Merge them. Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/cache.h | 14 ++++++++++---- arch/m68k/include/asm/cache_mm.h | 11 ----------- arch/m68k/include/asm/cache_no.h | 12 ------------ 3 files changed, 10 insertions(+), 27 deletions(-) delete mode 100644 arch/m68k/include/asm/cache_mm.h delete mode 100644 arch/m68k/include/asm/cache_no.h diff --git a/arch/m68k/include/asm/cache.h b/arch/m68k/include/asm/cache.h index 599c29bc8f40..fed3fd30de7e 100644 --- a/arch/m68k/include/asm/cache.h +++ b/arch/m68k/include/asm/cache.h @@ -1,5 +1,11 @@ -#ifdef __uClinux__ -#include "cache_no.h" -#else -#include "cache_mm.h" +/* + * include/asm-m68k/cache.h + */ +#ifndef __ARCH_M68K_CACHE_H +#define __ARCH_M68K_CACHE_H + +/* bytes per L1 cache line */ +#define L1_CACHE_SHIFT 4 +#define L1_CACHE_BYTES (1<< L1_CACHE_SHIFT) + #endif diff --git a/arch/m68k/include/asm/cache_mm.h b/arch/m68k/include/asm/cache_mm.h deleted file mode 100644 index fed3fd30de7e..000000000000 --- a/arch/m68k/include/asm/cache_mm.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * include/asm-m68k/cache.h - */ -#ifndef __ARCH_M68K_CACHE_H -#define __ARCH_M68K_CACHE_H - -/* bytes per L1 cache line */ -#define L1_CACHE_SHIFT 4 -#define L1_CACHE_BYTES (1<< L1_CACHE_SHIFT) - -#endif diff --git a/arch/m68k/include/asm/cache_no.h b/arch/m68k/include/asm/cache_no.h deleted file mode 100644 index 24e9eace5f8c..000000000000 --- a/arch/m68k/include/asm/cache_no.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __ARCH_M68KNOMMU_CACHE_H -#define __ARCH_M68KNOMMU_CACHE_H - -/* bytes per L1 cache line */ -#define L1_CACHE_BYTES 16 /* this need to be at least 1 */ - -/* m68k-elf-gcc 2.95.2 doesn't like these */ - -#define __cacheline_aligned -#define ____cacheline_aligned - -#endif -- cgit v1.2.3-59-g8ed1b From dbc367eb69cedf47c702310e4abe1514d52abb4b Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Fri, 6 Feb 2009 14:44:11 +1000 Subject: m68k: merge the mmu and non-mmu versions of bug.h Trivial merge of the mmu and non-mmu versions of bug.h Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/bug.h | 31 ++++++++++++++++++++++++++++--- arch/m68k/include/asm/bug_mm.h | 29 ----------------------------- arch/m68k/include/asm/bug_no.h | 4 ---- 3 files changed, 28 insertions(+), 36 deletions(-) delete mode 100644 arch/m68k/include/asm/bug_mm.h delete mode 100644 arch/m68k/include/asm/bug_no.h diff --git a/arch/m68k/include/asm/bug.h b/arch/m68k/include/asm/bug.h index 997e0944ebc1..ef9a2e47352f 100644 --- a/arch/m68k/include/asm/bug.h +++ b/arch/m68k/include/asm/bug.h @@ -1,5 +1,30 @@ -#ifdef __uClinux__ -#include "bug_no.h" +#ifndef _M68K_BUG_H +#define _M68K_BUG_H + +#ifdef CONFIG_MMU +#ifdef CONFIG_BUG +#ifdef CONFIG_DEBUG_BUGVERBOSE +#ifndef CONFIG_SUN3 +#define BUG() do { \ + printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ + __builtin_trap(); \ +} while (0) #else -#include "bug_mm.h" +#define BUG() do { \ + printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ + panic("BUG!"); \ +} while (0) +#endif +#else +#define BUG() do { \ + __builtin_trap(); \ +} while (0) +#endif + +#define HAVE_ARCH_BUG +#endif +#endif /* CONFIG_MMU */ + +#include + #endif diff --git a/arch/m68k/include/asm/bug_mm.h b/arch/m68k/include/asm/bug_mm.h deleted file mode 100644 index e5b528deb8a8..000000000000 --- a/arch/m68k/include/asm/bug_mm.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _M68K_BUG_H -#define _M68K_BUG_H - - -#ifdef CONFIG_BUG -#ifdef CONFIG_DEBUG_BUGVERBOSE -#ifndef CONFIG_SUN3 -#define BUG() do { \ - printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ - __builtin_trap(); \ -} while (0) -#else -#define BUG() do { \ - printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ - panic("BUG!"); \ -} while (0) -#endif -#else -#define BUG() do { \ - __builtin_trap(); \ -} while (0) -#endif - -#define HAVE_ARCH_BUG -#endif - -#include - -#endif diff --git a/arch/m68k/include/asm/bug_no.h b/arch/m68k/include/asm/bug_no.h deleted file mode 100644 index 70e7dc0af21a..000000000000 --- a/arch/m68k/include/asm/bug_no.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef _M68KNOMMU_BUG_H -#define _M68KNOMMU_BUG_H -#include -#endif -- cgit v1.2.3-59-g8ed1b From 76adcb2e812464c38b42f18ab76e78e71df40464 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Fri, 6 Feb 2009 14:45:23 +1000 Subject: m68k: merge the mmu and non-mmu versions of bugs.h Trivial merge of the mmu and non-mmu versions of bugs.h Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/bugs.h | 21 ++++++++++++++++++--- arch/m68k/include/asm/bugs_mm.h | 14 -------------- arch/m68k/include/asm/bugs_no.h | 16 ---------------- 3 files changed, 18 insertions(+), 33 deletions(-) delete mode 100644 arch/m68k/include/asm/bugs_mm.h delete mode 100644 arch/m68k/include/asm/bugs_no.h diff --git a/arch/m68k/include/asm/bugs.h b/arch/m68k/include/asm/bugs.h index 01f047d784ec..d06207b9ba5a 100644 --- a/arch/m68k/include/asm/bugs.h +++ b/arch/m68k/include/asm/bugs.h @@ -1,5 +1,20 @@ -#ifdef __uClinux__ -#include "bugs_no.h" +/* + * include/asm-m68k/bugs.h + * + * Copyright (C) 1994 Linus Torvalds + */ + +/* + * This is included by init/main.c to check for architecture-dependent bugs. + * + * Needs: + * void check_bugs(void); + */ + +#ifdef CONFIG_MMU +extern void check_bugs(void); /* in arch/m68k/kernel/setup.c */ #else -#include "bugs_mm.h" +static void check_bugs(void) +{ +} #endif diff --git a/arch/m68k/include/asm/bugs_mm.h b/arch/m68k/include/asm/bugs_mm.h deleted file mode 100644 index d01935592410..000000000000 --- a/arch/m68k/include/asm/bugs_mm.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * include/asm-m68k/bugs.h - * - * Copyright (C) 1994 Linus Torvalds - */ - -/* - * This is included by init/main.c to check for architecture-dependent bugs. - * - * Needs: - * void check_bugs(void); - */ - -extern void check_bugs(void); /* in arch/m68k/kernel/setup.c */ diff --git a/arch/m68k/include/asm/bugs_no.h b/arch/m68k/include/asm/bugs_no.h deleted file mode 100644 index 5f382dac3a60..000000000000 --- a/arch/m68k/include/asm/bugs_no.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * include/asm-m68k/bugs.h - * - * Copyright (C) 1994 Linus Torvalds - */ - -/* - * This is included by init/main.c to check for architecture-dependent bugs. - * - * Needs: - * void check_bugs(void); - */ - -static void check_bugs(void) -{ -} -- cgit v1.2.3-59-g8ed1b From d20f5aa338dc75fb2e7bfb7627d3dfcc81196e69 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Fri, 6 Feb 2009 14:50:08 +1000 Subject: m68k: merge the mmu and non-mmu versions of div64.h Trivial merge of the mmu and non-mmu versions of div64.h Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/div64.h | 37 +++++++++++++++++++++++++++++++++---- arch/m68k/include/asm/div64_mm.h | 28 ---------------------------- arch/m68k/include/asm/div64_no.h | 1 - 3 files changed, 33 insertions(+), 33 deletions(-) delete mode 100644 arch/m68k/include/asm/div64_mm.h delete mode 100644 arch/m68k/include/asm/div64_no.h diff --git a/arch/m68k/include/asm/div64.h b/arch/m68k/include/asm/div64.h index d211d9f54276..edb66148a71d 100644 --- a/arch/m68k/include/asm/div64.h +++ b/arch/m68k/include/asm/div64.h @@ -1,5 +1,34 @@ -#ifdef __uClinux__ -#include "div64_no.h" +#ifndef _M68K_DIV64_H +#define _M68K_DIV64_H + +#ifdef CONFIG_MMU + +#include + +/* n = n / base; return rem; */ + +#define do_div(n, base) ({ \ + union { \ + unsigned long n32[2]; \ + unsigned long long n64; \ + } __n; \ + unsigned long __rem, __upper; \ + \ + __n.n64 = (n); \ + if ((__upper = __n.n32[0])) { \ + asm ("divul.l %2,%1:%0" \ + : "=d" (__n.n32[0]), "=d" (__upper) \ + : "d" (base), "0" (__n.n32[0])); \ + } \ + asm ("divu.l %2,%1:%0" \ + : "=d" (__n.n32[1]), "=d" (__rem) \ + : "d" (base), "1" (__upper), "0" (__n.n32[1])); \ + (n) = __n.n64; \ + __rem; \ +}) + #else -#include "div64_mm.h" -#endif +#include +#endif /* CONFIG_MMU */ + +#endif /* _M68K_DIV64_H */ diff --git a/arch/m68k/include/asm/div64_mm.h b/arch/m68k/include/asm/div64_mm.h deleted file mode 100644 index 8243c931b5c0..000000000000 --- a/arch/m68k/include/asm/div64_mm.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef _M68K_DIV64_H -#define _M68K_DIV64_H - -#include - -/* n = n / base; return rem; */ - -#define do_div(n, base) ({ \ - union { \ - unsigned long n32[2]; \ - unsigned long long n64; \ - } __n; \ - unsigned long __rem, __upper; \ - \ - __n.n64 = (n); \ - if ((__upper = __n.n32[0])) { \ - asm ("divul.l %2,%1:%0" \ - : "=d" (__n.n32[0]), "=d" (__upper) \ - : "d" (base), "0" (__n.n32[0])); \ - } \ - asm ("divu.l %2,%1:%0" \ - : "=d" (__n.n32[1]), "=d" (__rem) \ - : "d" (base), "1" (__upper), "0" (__n.n32[1])); \ - (n) = __n.n64; \ - __rem; \ -}) - -#endif /* _M68K_DIV64_H */ diff --git a/arch/m68k/include/asm/div64_no.h b/arch/m68k/include/asm/div64_no.h deleted file mode 100644 index 6cd978cefb28..000000000000 --- a/arch/m68k/include/asm/div64_no.h +++ /dev/null @@ -1 +0,0 @@ -#include -- cgit v1.2.3-59-g8ed1b From 2844b660358c9230a2e85a0218f0df4046d5c392 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Fri, 6 Feb 2009 14:54:08 +1000 Subject: m68k: merge the mmu and non-mmu versions of current.h Trivial merge of the mmu and non-mmu versions of current.h Signed-off-by: Greg Ungerer Acked-by: Geert Uytterhoeven --- arch/m68k/include/asm/current.h | 31 +++++++++++++++++++++++++++---- arch/m68k/include/asm/current_mm.h | 6 ------ arch/m68k/include/asm/current_no.h | 24 ------------------------ 3 files changed, 27 insertions(+), 34 deletions(-) delete mode 100644 arch/m68k/include/asm/current_mm.h delete mode 100644 arch/m68k/include/asm/current_no.h diff --git a/arch/m68k/include/asm/current.h b/arch/m68k/include/asm/current.h index 51b056dfaedd..91fcc5358cfe 100644 --- a/arch/m68k/include/asm/current.h +++ b/arch/m68k/include/asm/current.h @@ -1,5 +1,28 @@ -#ifdef __uClinux__ -#include "current_no.h" +#ifndef _M68K_CURRENT_H +#define _M68K_CURRENT_H + +#ifdef CONFIG_MMU + +register struct task_struct *current __asm__("%a2"); + #else -#include "current_mm.h" -#endif + +/* + * Rather than dedicate a register (as the m68k source does), we + * just keep a global, we should probably just change it all to be + * current and lose _current_task. + */ +#include + +struct task_struct; + +static inline struct task_struct *get_current(void) +{ + return(current_thread_info()->task); +} + +#define current get_current() + +#endif /* CONFNIG_MMU */ + +#endif /* !(_M68K_CURRENT_H) */ diff --git a/arch/m68k/include/asm/current_mm.h b/arch/m68k/include/asm/current_mm.h deleted file mode 100644 index 8de8f8ceda61..000000000000 --- a/arch/m68k/include/asm/current_mm.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _M68K_CURRENT_H -#define _M68K_CURRENT_H - -register struct task_struct *current __asm__("%a2"); - -#endif /* !(_M68K_CURRENT_H) */ diff --git a/arch/m68k/include/asm/current_no.h b/arch/m68k/include/asm/current_no.h deleted file mode 100644 index 53ee0f9f7cef..000000000000 --- a/arch/m68k/include/asm/current_no.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _M68KNOMMU_CURRENT_H -#define _M68KNOMMU_CURRENT_H -/* - * current.h - * (C) Copyright 2000, Lineo, David McCullough - * (C) Copyright 2002, Greg Ungerer (gerg@snapgear.com) - * - * rather than dedicate a register (as the m68k source does), we - * just keep a global, we should probably just change it all to be - * current and lose _current_task. - */ - -#include - -struct task_struct; - -static inline struct task_struct *get_current(void) -{ - return(current_thread_info()->task); -} - -#define current get_current() - -#endif /* _M68KNOMMU_CURRENT_H */ -- cgit v1.2.3-59-g8ed1b From 66d73f00a617be4feb45885a56fc5e6441b30a92 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 10 Feb 2009 09:49:00 +1000 Subject: m68k: merge the mmu and non-mmu versions of mmu_context.h Simple merge of the mmu and non-mmu versions of mmu_context.h Signed-off-by: Greg Ungerer --- arch/m68k/include/asm/mmu_context.h | 176 ++++++++++++++++++++++++++++++++- arch/m68k/include/asm/mmu_context_mm.h | 154 ----------------------------- arch/m68k/include/asm/mmu_context_no.h | 33 ------- 3 files changed, 173 insertions(+), 190 deletions(-) delete mode 100644 arch/m68k/include/asm/mmu_context_mm.h delete mode 100644 arch/m68k/include/asm/mmu_context_no.h diff --git a/arch/m68k/include/asm/mmu_context.h b/arch/m68k/include/asm/mmu_context.h index b440928fc6c7..7d4341e55a99 100644 --- a/arch/m68k/include/asm/mmu_context.h +++ b/arch/m68k/include/asm/mmu_context.h @@ -1,5 +1,175 @@ -#ifdef __uClinux__ -#include "mmu_context_no.h" +#ifndef __M68K_MMU_CONTEXT_H +#define __M68K_MMU_CONTEXT_H + +#include + +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +} + +#ifdef CONFIG_MMU +#ifndef CONFIG_SUN3 + +#include +#include +#include + +static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +{ + mm->context = virt_to_phys(mm->pgd); + return 0; +} + +#define destroy_context(mm) do { } while(0) + +static inline void switch_mm_0230(struct mm_struct *mm) +{ + unsigned long crp[2] = { + 0x80000000 | _PAGE_TABLE, mm->context + }; + unsigned long tmp; + + asm volatile (".chip 68030"); + + /* flush MC68030/MC68020 caches (they are virtually addressed) */ + asm volatile ( + "movec %%cacr,%0;" + "orw %1,%0; " + "movec %0,%%cacr" + : "=d" (tmp) : "di" (FLUSH_I_AND_D)); + + /* Switch the root pointer. For a 030-only kernel, + * avoid flushing the whole ATC, we only need to + * flush the user entries. The 68851 does this by + * itself. Avoid a runtime check here. + */ + asm volatile ( +#ifdef CPU_M68030_ONLY + "pmovefd %0,%%crp; " + "pflush #0,#4" #else -#include "mmu_context_mm.h" + "pmove %0,%%crp" #endif + : : "m" (crp[0])); + + asm volatile (".chip 68k"); +} + +static inline void switch_mm_0460(struct mm_struct *mm) +{ + asm volatile (".chip 68040"); + + /* flush address translation cache (user entries) */ + asm volatile ("pflushan"); + + /* switch the root pointer */ + asm volatile ("movec %0,%%urp" : : "r" (mm->context)); + + if (CPU_IS_060) { + unsigned long tmp; + + /* clear user entries in the branch cache */ + asm volatile ( + "movec %%cacr,%0; " + "orl %1,%0; " + "movec %0,%%cacr" + : "=d" (tmp): "di" (0x00200000)); + } + + asm volatile (".chip 68k"); +} + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) +{ + if (prev != next) { + if (CPU_IS_020_OR_030) + switch_mm_0230(next); + else + switch_mm_0460(next); + } +} + +#define deactivate_mm(tsk,mm) do { } while (0) + +static inline void activate_mm(struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + next_mm->context = virt_to_phys(next_mm->pgd); + + if (CPU_IS_020_OR_030) + switch_mm_0230(next_mm); + else + switch_mm_0460(next_mm); +} + +#else /* CONFIG_SUN3 */ +#include +#include + +extern unsigned long get_free_context(struct mm_struct *mm); +extern void clear_context(unsigned long context); + +/* set the context for a new task to unmapped */ +static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + mm->context = SUN3_INVALID_CONTEXT; + return 0; +} + +/* find the context given to this process, and if it hasn't already + got one, go get one for it. */ +static inline void get_mmu_context(struct mm_struct *mm) +{ + if(mm->context == SUN3_INVALID_CONTEXT) + mm->context = get_free_context(mm); +} + +/* flush context if allocated... */ +static inline void destroy_context(struct mm_struct *mm) +{ + if(mm->context != SUN3_INVALID_CONTEXT) + clear_context(mm->context); +} + +static inline void activate_context(struct mm_struct *mm) +{ + get_mmu_context(mm); + sun3_put_context(mm->context); +} + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) +{ + activate_context(tsk->mm); +} + +#define deactivate_mm(tsk,mm) do { } while (0) + +static inline void activate_mm(struct mm_struct *prev_mm, + struct mm_struct *next_mm) +{ + activate_context(next_mm); +} + +#endif +#else /* !CONFIG_MMU */ + +static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + return 0; +} + + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) +{ +} + +#define destroy_context(mm) do { } while (0) +#define deactivate_mm(tsk,mm) do { } while (0) + +static inline void activate_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm) +{ +} + +#endif /* CONFIG_MMU */ +#endif /* __M68K_MMU_CONTEXT_H */ diff --git a/arch/m68k/include/asm/mmu_context_mm.h b/arch/m68k/include/asm/mmu_context_mm.h deleted file mode 100644 index 894dacbcee14..000000000000 --- a/arch/m68k/include/asm/mmu_context_mm.h +++ /dev/null @@ -1,154 +0,0 @@ -#ifndef __M68K_MMU_CONTEXT_H -#define __M68K_MMU_CONTEXT_H - -#include - -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ -} - -#ifndef CONFIG_SUN3 - -#include -#include -#include - -static inline int init_new_context(struct task_struct *tsk, - struct mm_struct *mm) -{ - mm->context = virt_to_phys(mm->pgd); - return 0; -} - -#define destroy_context(mm) do { } while(0) - -static inline void switch_mm_0230(struct mm_struct *mm) -{ - unsigned long crp[2] = { - 0x80000000 | _PAGE_TABLE, mm->context - }; - unsigned long tmp; - - asm volatile (".chip 68030"); - - /* flush MC68030/MC68020 caches (they are virtually addressed) */ - asm volatile ( - "movec %%cacr,%0;" - "orw %1,%0; " - "movec %0,%%cacr" - : "=d" (tmp) : "di" (FLUSH_I_AND_D)); - - /* Switch the root pointer. For a 030-only kernel, - * avoid flushing the whole ATC, we only need to - * flush the user entries. The 68851 does this by - * itself. Avoid a runtime check here. - */ - asm volatile ( -#ifdef CPU_M68030_ONLY - "pmovefd %0,%%crp; " - "pflush #0,#4" -#else - "pmove %0,%%crp" -#endif - : : "m" (crp[0])); - - asm volatile (".chip 68k"); -} - -static inline void switch_mm_0460(struct mm_struct *mm) -{ - asm volatile (".chip 68040"); - - /* flush address translation cache (user entries) */ - asm volatile ("pflushan"); - - /* switch the root pointer */ - asm volatile ("movec %0,%%urp" : : "r" (mm->context)); - - if (CPU_IS_060) { - unsigned long tmp; - - /* clear user entries in the branch cache */ - asm volatile ( - "movec %%cacr,%0; " - "orl %1,%0; " - "movec %0,%%cacr" - : "=d" (tmp): "di" (0x00200000)); - } - - asm volatile (".chip 68k"); -} - -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) -{ - if (prev != next) { - if (CPU_IS_020_OR_030) - switch_mm_0230(next); - else - switch_mm_0460(next); - } -} - -#define deactivate_mm(tsk,mm) do { } while (0) - -static inline void activate_mm(struct mm_struct *prev_mm, - struct mm_struct *next_mm) -{ - next_mm->context = virt_to_phys(next_mm->pgd); - - if (CPU_IS_020_OR_030) - switch_mm_0230(next_mm); - else - switch_mm_0460(next_mm); -} - -#else /* CONFIG_SUN3 */ -#include -#include - -extern unsigned long get_free_context(struct mm_struct *mm); -extern void clear_context(unsigned long context); - -/* set the context for a new task to unmapped */ -static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - mm->context = SUN3_INVALID_CONTEXT; - return 0; -} - -/* find the context given to this process, and if it hasn't already - got one, go get one for it. */ -static inline void get_mmu_context(struct mm_struct *mm) -{ - if(mm->context == SUN3_INVALID_CONTEXT) - mm->context = get_free_context(mm); -} - -/* flush context if allocated... */ -static inline void destroy_context(struct mm_struct *mm) -{ - if(mm->context != SUN3_INVALID_CONTEXT) - clear_context(mm->context); -} - -static inline void activate_context(struct mm_struct *mm) -{ - get_mmu_context(mm); - sun3_put_context(mm->context); -} - -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) -{ - activate_context(tsk->mm); -} - -#define deactivate_mm(tsk,mm) do { } while (0) - -static inline void activate_mm(struct mm_struct *prev_mm, - struct mm_struct *next_mm) -{ - activate_context(next_mm); -} - -#endif -#endif diff --git a/arch/m68k/include/asm/mmu_context_no.h b/arch/m68k/include/asm/mmu_context_no.h deleted file mode 100644 index 9ccee4278c97..000000000000 --- a/arch/m68k/include/asm/mmu_context_no.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __M68KNOMMU_MMU_CONTEXT_H -#define __M68KNOMMU_MMU_CONTEXT_H - -#include -#include -#include -#include - -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ -} - -static inline int -init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - // mm->context = virt_to_phys(mm->pgd); - return(0); -} - -#define destroy_context(mm) do { } while(0) - -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) -{ -} - -#define deactivate_mm(tsk,mm) do { } while (0) - -static inline void activate_mm(struct mm_struct *prev_mm, - struct mm_struct *next_mm) -{ -} - -#endif -- cgit v1.2.3-59-g8ed1b From e2545b65de4996ac99973a825060884ef31b9449 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 12 Feb 2009 16:02:05 +1000 Subject: m68knommu: add missing interrupt line definition for UART 2 Signed-off-by: David Wu Acked-by: Michael Durrant Acked-by: Oleksandr Zhadan Signed-off-by: Greg Ungerer --- arch/m68k/include/asm/m532xsim.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/m68k/include/asm/m532xsim.h b/arch/m68k/include/asm/m532xsim.h index 1835fd20a82c..ce603451b55e 100644 --- a/arch/m68k/include/asm/m532xsim.h +++ b/arch/m68k/include/asm/m532xsim.h @@ -16,6 +16,7 @@ #define MCFINT_VECBASE 64 #define MCFINT_UART0 26 /* Interrupt number for UART0 */ #define MCFINT_UART1 27 /* Interrupt number for UART1 */ +#define MCFINT_UART2 28 /* Interrupt number for UART2 */ #define MCF_WTM_WCR MCF_REG16(0xFC098000) -- cgit v1.2.3-59-g8ed1b From a5505464c7c133d01f409426982aa28da111ceb8 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Thu, 19 Feb 2009 14:18:23 +1000 Subject: m68k: merge the mmu and non-mmu versions of module.h Simple merge of the mmu and non-mmu versions of module.h Signed-off-by: Greg Ungerer --- arch/m68k/include/asm/module.h | 51 ++++++++++++++++++++++++++++++++++++--- arch/m68k/include/asm/module_mm.h | 39 ------------------------------ arch/m68k/include/asm/module_no.h | 11 --------- 3 files changed, 47 insertions(+), 54 deletions(-) delete mode 100644 arch/m68k/include/asm/module_mm.h delete mode 100644 arch/m68k/include/asm/module_no.h diff --git a/arch/m68k/include/asm/module.h b/arch/m68k/include/asm/module.h index 79b59d137dd0..5f21e11071bd 100644 --- a/arch/m68k/include/asm/module.h +++ b/arch/m68k/include/asm/module.h @@ -1,5 +1,48 @@ -#ifdef __uClinux__ -#include "module_no.h" +#ifndef _ASM_M68K_MODULE_H +#define _ASM_M68K_MODULE_H + +#ifdef CONFIG_MMU + +struct mod_arch_specific { + struct m68k_fixup_info *fixup_start, *fixup_end; +}; + +#define MODULE_ARCH_INIT { \ + .fixup_start = __start_fixup, \ + .fixup_end = __stop_fixup, \ +} + + +enum m68k_fixup_type { + m68k_fixup_memoffset, + m68k_fixup_vnode_shift, +}; + +struct m68k_fixup_info { + enum m68k_fixup_type type; + void *addr; +}; + +#define m68k_fixup(type, addr) \ + " .section \".m68k_fixup\",\"aw\"\n" \ + " .long " #type "," #addr "\n" \ + " .previous\n" + +extern struct m68k_fixup_info __start_fixup[], __stop_fixup[]; + +struct module; +extern void module_fixup(struct module *mod, struct m68k_fixup_info *start, + struct m68k_fixup_info *end); + #else -#include "module_mm.h" -#endif + +struct mod_arch_specific { +}; + +#endif /* CONFIG_MMU */ + +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Ehdr Elf32_Ehdr + +#endif /* _ASM_M68K_MODULE_H */ diff --git a/arch/m68k/include/asm/module_mm.h b/arch/m68k/include/asm/module_mm.h deleted file mode 100644 index 382d20a6fc18..000000000000 --- a/arch/m68k/include/asm/module_mm.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _ASM_M68K_MODULE_H -#define _ASM_M68K_MODULE_H - -struct mod_arch_specific { - struct m68k_fixup_info *fixup_start, *fixup_end; -}; - -#define MODULE_ARCH_INIT { \ - .fixup_start = __start_fixup, \ - .fixup_end = __stop_fixup, \ -} - -#define Elf_Shdr Elf32_Shdr -#define Elf_Sym Elf32_Sym -#define Elf_Ehdr Elf32_Ehdr - - -enum m68k_fixup_type { - m68k_fixup_memoffset, - m68k_fixup_vnode_shift, -}; - -struct m68k_fixup_info { - enum m68k_fixup_type type; - void *addr; -}; - -#define m68k_fixup(type, addr) \ - " .section \".m68k_fixup\",\"aw\"\n" \ - " .long " #type "," #addr "\n" \ - " .previous\n" - -extern struct m68k_fixup_info __start_fixup[], __stop_fixup[]; - -struct module; -extern void module_fixup(struct module *mod, struct m68k_fixup_info *start, - struct m68k_fixup_info *end); - -#endif /* _ASM_M68K_MODULE_H */ diff --git a/arch/m68k/include/asm/module_no.h b/arch/m68k/include/asm/module_no.h deleted file mode 100644 index 2e45ab50b232..000000000000 --- a/arch/m68k/include/asm/module_no.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef ASM_M68KNOMMU_MODULE_H -#define ASM_M68KNOMMU_MODULE_H - -struct mod_arch_specific { -}; - -#define Elf_Shdr Elf32_Shdr -#define Elf_Sym Elf32_Sym -#define Elf_Ehdr Elf32_Ehdr - -#endif /* ASM_M68KNOMMU_MODULE_H */ -- cgit v1.2.3-59-g8ed1b From facdf0ed4f594485fb2a1d2d024a150924c6b01c Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Wed, 4 Mar 2009 10:43:06 +1000 Subject: m68knommu: introduce basic clk infrastructure Create basic support for clk API. Recent changes to the FEC driver (used by many ColdFire family parts) need this. Initially only supports getting the master clock frequency. Signed-off-by: Greg Ungerer --- arch/m68knommu/platform/coldfire/Makefile | 2 +- arch/m68knommu/platform/coldfire/clk.c | 40 +++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 arch/m68knommu/platform/coldfire/clk.c diff --git a/arch/m68knommu/platform/coldfire/Makefile b/arch/m68knommu/platform/coldfire/Makefile index 4f416a91a829..1bcb9372353f 100644 --- a/arch/m68knommu/platform/coldfire/Makefile +++ b/arch/m68knommu/platform/coldfire/Makefile @@ -14,7 +14,7 @@ asflags-$(CONFIG_FULLDEBUG) := -DDEBUGGER_COMPATIBLE_CACHE=1 -obj-$(CONFIG_COLDFIRE) += dma.o entry.o vectors.o +obj-$(CONFIG_COLDFIRE) += clk.o dma.o entry.o vectors.o obj-$(CONFIG_M5206) += timers.o obj-$(CONFIG_M5206e) += timers.o obj-$(CONFIG_M520x) += pit.o diff --git a/arch/m68knommu/platform/coldfire/clk.c b/arch/m68knommu/platform/coldfire/clk.c new file mode 100644 index 000000000000..7cdbf445b28f --- /dev/null +++ b/arch/m68knommu/platform/coldfire/clk.c @@ -0,0 +1,40 @@ +/***************************************************************************/ + +/* + * clk.c -- general ColdFire CPU kernel clk handling + * + * Copyright (C) 2009, Greg Ungerer (gerg@snapgear.com) + */ + +/***************************************************************************/ + +#include +#include +#include + +/***************************************************************************/ + +struct clk *clk_get(struct device *dev, const char *id) +{ + return NULL; +} + +int clk_enable(struct clk *clk) +{ + return 0; +} + +void clk_disable(struct clk *clk) +{ +} + +void clk_put(struct clk *clk) +{ +} + +unsigned long clk_get_rate(struct clk *clk) +{ + return MCF_CLK; +} + +/***************************************************************************/ -- cgit v1.2.3-59-g8ed1b From da4f4a02abebb8b6f49e1585acbb23921a5da410 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Wed, 11 Mar 2009 16:44:39 +1000 Subject: m68k: merge the mmu and non-mmu versions of tlbflush.h Simple merge of the mmu and non-mmu versions of tlbflush.h Signed-off-by: Greg Ungerer --- arch/m68k/include/asm/tlbflush.h | 268 +++++++++++++++++++++++++++++++++++- arch/m68k/include/asm/tlbflush_mm.h | 219 ----------------------------- arch/m68k/include/asm/tlbflush_no.h | 55 -------- 3 files changed, 265 insertions(+), 277 deletions(-) delete mode 100644 arch/m68k/include/asm/tlbflush_mm.h delete mode 100644 arch/m68k/include/asm/tlbflush_no.h diff --git a/arch/m68k/include/asm/tlbflush.h b/arch/m68k/include/asm/tlbflush.h index b6f93b30951e..a6b4ed4fc90f 100644 --- a/arch/m68k/include/asm/tlbflush.h +++ b/arch/m68k/include/asm/tlbflush.h @@ -1,5 +1,267 @@ -#ifdef __uClinux__ -#include "tlbflush_no.h" +#ifndef _M68K_TLBFLUSH_H +#define _M68K_TLBFLUSH_H + +#ifdef CONFIG_MMU +#ifndef CONFIG_SUN3 + +#include + +static inline void flush_tlb_kernel_page(void *addr) +{ + if (CPU_IS_040_OR_060) { + mm_segment_t old_fs = get_fs(); + set_fs(KERNEL_DS); + __asm__ __volatile__(".chip 68040\n\t" + "pflush (%0)\n\t" + ".chip 68k" + : : "a" (addr)); + set_fs(old_fs); + } else if (CPU_IS_020_OR_030) + __asm__ __volatile__("pflush #4,#4,(%0)" : : "a" (addr)); +} + +/* + * flush all user-space atc entries. + */ +static inline void __flush_tlb(void) +{ + if (CPU_IS_040_OR_060) + __asm__ __volatile__(".chip 68040\n\t" + "pflushan\n\t" + ".chip 68k"); + else if (CPU_IS_020_OR_030) + __asm__ __volatile__("pflush #0,#4"); +} + +static inline void __flush_tlb040_one(unsigned long addr) +{ + __asm__ __volatile__(".chip 68040\n\t" + "pflush (%0)\n\t" + ".chip 68k" + : : "a" (addr)); +} + +static inline void __flush_tlb_one(unsigned long addr) +{ + if (CPU_IS_040_OR_060) + __flush_tlb040_one(addr); + else if (CPU_IS_020_OR_030) + __asm__ __volatile__("pflush #0,#4,(%0)" : : "a" (addr)); +} + +#define flush_tlb() __flush_tlb() + +/* + * flush all atc entries (both kernel and user-space entries). + */ +static inline void flush_tlb_all(void) +{ + if (CPU_IS_040_OR_060) + __asm__ __volatile__(".chip 68040\n\t" + "pflusha\n\t" + ".chip 68k"); + else if (CPU_IS_020_OR_030) + __asm__ __volatile__("pflusha"); +} + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (mm == current->active_mm) + __flush_tlb(); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) +{ + if (vma->vm_mm == current->active_mm) { + mm_segment_t old_fs = get_fs(); + set_fs(USER_DS); + __flush_tlb_one(addr); + set_fs(old_fs); + } +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb(); +} + +static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + flush_tlb_all(); +} + #else -#include "tlbflush_mm.h" + + +/* Reserved PMEGs. */ +extern char sun3_reserved_pmeg[SUN3_PMEGS_NUM]; +extern unsigned long pmeg_vaddr[SUN3_PMEGS_NUM]; +extern unsigned char pmeg_alloc[SUN3_PMEGS_NUM]; +extern unsigned char pmeg_ctx[SUN3_PMEGS_NUM]; + +/* Flush all userspace mappings one by one... (why no flush command, + sun?) */ +static inline void flush_tlb_all(void) +{ + unsigned long addr; + unsigned char ctx, oldctx; + + oldctx = sun3_get_context(); + for(addr = 0x00000000; addr < TASK_SIZE; addr += SUN3_PMEG_SIZE) { + for(ctx = 0; ctx < 8; ctx++) { + sun3_put_context(ctx); + sun3_put_segmap(addr, SUN3_INVALID_PMEG); + } + } + + sun3_put_context(oldctx); + /* erase all of the userspace pmeg maps, we've clobbered them + all anyway */ + for(addr = 0; addr < SUN3_INVALID_PMEG; addr++) { + if(pmeg_alloc[addr] == 1) { + pmeg_alloc[addr] = 0; + pmeg_ctx[addr] = 0; + pmeg_vaddr[addr] = 0; + } + } + +} + +/* Clear user TLB entries within the context named in mm */ +static inline void flush_tlb_mm (struct mm_struct *mm) +{ + unsigned char oldctx; + unsigned char seg; + unsigned long i; + + oldctx = sun3_get_context(); + sun3_put_context(mm->context); + + for(i = 0; i < TASK_SIZE; i += SUN3_PMEG_SIZE) { + seg = sun3_get_segmap(i); + if(seg == SUN3_INVALID_PMEG) + continue; + + sun3_put_segmap(i, SUN3_INVALID_PMEG); + pmeg_alloc[seg] = 0; + pmeg_ctx[seg] = 0; + pmeg_vaddr[seg] = 0; + } + + sun3_put_context(oldctx); + +} + +/* Flush a single TLB page. In this case, we're limited to flushing a + single PMEG */ +static inline void flush_tlb_page (struct vm_area_struct *vma, + unsigned long addr) +{ + unsigned char oldctx; + unsigned char i; + + oldctx = sun3_get_context(); + sun3_put_context(vma->vm_mm->context); + addr &= ~SUN3_PMEG_MASK; + if((i = sun3_get_segmap(addr)) != SUN3_INVALID_PMEG) + { + pmeg_alloc[i] = 0; + pmeg_ctx[i] = 0; + pmeg_vaddr[i] = 0; + sun3_put_segmap (addr, SUN3_INVALID_PMEG); + } + sun3_put_context(oldctx); + +} +/* Flush a range of pages from TLB. */ + +static inline void flush_tlb_range (struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned char seg, oldctx; + + start &= ~SUN3_PMEG_MASK; + + oldctx = sun3_get_context(); + sun3_put_context(mm->context); + + while(start < end) + { + if((seg = sun3_get_segmap(start)) == SUN3_INVALID_PMEG) + goto next; + if(pmeg_ctx[seg] == mm->context) { + pmeg_alloc[seg] = 0; + pmeg_ctx[seg] = 0; + pmeg_vaddr[seg] = 0; + } + sun3_put_segmap(start, SUN3_INVALID_PMEG); + next: + start += SUN3_PMEG_SIZE; + } +} + +static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + flush_tlb_all(); +} + +/* Flush kernel page from TLB. */ +static inline void flush_tlb_kernel_page (unsigned long addr) +{ + sun3_put_segmap (addr & ~(SUN3_PMEG_SIZE - 1), SUN3_INVALID_PMEG); +} + #endif + +#else /* !CONFIG_MMU */ + +/* + * flush all user-space atc entries. + */ +static inline void __flush_tlb(void) +{ + BUG(); +} + +static inline void __flush_tlb_one(unsigned long addr) +{ + BUG(); +} + +#define flush_tlb() __flush_tlb() + +/* + * flush all atc entries (both kernel and user-space entries). + */ +static inline void flush_tlb_all(void) +{ + BUG(); +} + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + BUG(); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) +{ + BUG(); +} + +static inline void flush_tlb_range(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + BUG(); +} + +static inline void flush_tlb_kernel_page(unsigned long addr) +{ + BUG(); +} + +#endif /* CONFIG_MMU */ + +#endif /* _M68K_TLBFLUSH_H */ diff --git a/arch/m68k/include/asm/tlbflush_mm.h b/arch/m68k/include/asm/tlbflush_mm.h deleted file mode 100644 index acb6bf21a321..000000000000 --- a/arch/m68k/include/asm/tlbflush_mm.h +++ /dev/null @@ -1,219 +0,0 @@ -#ifndef _M68K_TLBFLUSH_H -#define _M68K_TLBFLUSH_H - - -#ifndef CONFIG_SUN3 - -#include - -static inline void flush_tlb_kernel_page(void *addr) -{ - if (CPU_IS_040_OR_060) { - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - __asm__ __volatile__(".chip 68040\n\t" - "pflush (%0)\n\t" - ".chip 68k" - : : "a" (addr)); - set_fs(old_fs); - } else if (CPU_IS_020_OR_030) - __asm__ __volatile__("pflush #4,#4,(%0)" : : "a" (addr)); -} - -/* - * flush all user-space atc entries. - */ -static inline void __flush_tlb(void) -{ - if (CPU_IS_040_OR_060) - __asm__ __volatile__(".chip 68040\n\t" - "pflushan\n\t" - ".chip 68k"); - else if (CPU_IS_020_OR_030) - __asm__ __volatile__("pflush #0,#4"); -} - -static inline void __flush_tlb040_one(unsigned long addr) -{ - __asm__ __volatile__(".chip 68040\n\t" - "pflush (%0)\n\t" - ".chip 68k" - : : "a" (addr)); -} - -static inline void __flush_tlb_one(unsigned long addr) -{ - if (CPU_IS_040_OR_060) - __flush_tlb040_one(addr); - else if (CPU_IS_020_OR_030) - __asm__ __volatile__("pflush #0,#4,(%0)" : : "a" (addr)); -} - -#define flush_tlb() __flush_tlb() - -/* - * flush all atc entries (both kernel and user-space entries). - */ -static inline void flush_tlb_all(void) -{ - if (CPU_IS_040_OR_060) - __asm__ __volatile__(".chip 68040\n\t" - "pflusha\n\t" - ".chip 68k"); - else if (CPU_IS_020_OR_030) - __asm__ __volatile__("pflusha"); -} - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) - __flush_tlb(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) { - mm_segment_t old_fs = get_fs(); - set_fs(USER_DS); - __flush_tlb_one(addr); - set_fs(old_fs); - } -} - -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb(); -} - -static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - flush_tlb_all(); -} - -#else - - -/* Reserved PMEGs. */ -extern char sun3_reserved_pmeg[SUN3_PMEGS_NUM]; -extern unsigned long pmeg_vaddr[SUN3_PMEGS_NUM]; -extern unsigned char pmeg_alloc[SUN3_PMEGS_NUM]; -extern unsigned char pmeg_ctx[SUN3_PMEGS_NUM]; - -/* Flush all userspace mappings one by one... (why no flush command, - sun?) */ -static inline void flush_tlb_all(void) -{ - unsigned long addr; - unsigned char ctx, oldctx; - - oldctx = sun3_get_context(); - for(addr = 0x00000000; addr < TASK_SIZE; addr += SUN3_PMEG_SIZE) { - for(ctx = 0; ctx < 8; ctx++) { - sun3_put_context(ctx); - sun3_put_segmap(addr, SUN3_INVALID_PMEG); - } - } - - sun3_put_context(oldctx); - /* erase all of the userspace pmeg maps, we've clobbered them - all anyway */ - for(addr = 0; addr < SUN3_INVALID_PMEG; addr++) { - if(pmeg_alloc[addr] == 1) { - pmeg_alloc[addr] = 0; - pmeg_ctx[addr] = 0; - pmeg_vaddr[addr] = 0; - } - } - -} - -/* Clear user TLB entries within the context named in mm */ -static inline void flush_tlb_mm (struct mm_struct *mm) -{ - unsigned char oldctx; - unsigned char seg; - unsigned long i; - - oldctx = sun3_get_context(); - sun3_put_context(mm->context); - - for(i = 0; i < TASK_SIZE; i += SUN3_PMEG_SIZE) { - seg = sun3_get_segmap(i); - if(seg == SUN3_INVALID_PMEG) - continue; - - sun3_put_segmap(i, SUN3_INVALID_PMEG); - pmeg_alloc[seg] = 0; - pmeg_ctx[seg] = 0; - pmeg_vaddr[seg] = 0; - } - - sun3_put_context(oldctx); - -} - -/* Flush a single TLB page. In this case, we're limited to flushing a - single PMEG */ -static inline void flush_tlb_page (struct vm_area_struct *vma, - unsigned long addr) -{ - unsigned char oldctx; - unsigned char i; - - oldctx = sun3_get_context(); - sun3_put_context(vma->vm_mm->context); - addr &= ~SUN3_PMEG_MASK; - if((i = sun3_get_segmap(addr)) != SUN3_INVALID_PMEG) - { - pmeg_alloc[i] = 0; - pmeg_ctx[i] = 0; - pmeg_vaddr[i] = 0; - sun3_put_segmap (addr, SUN3_INVALID_PMEG); - } - sun3_put_context(oldctx); - -} -/* Flush a range of pages from TLB. */ - -static inline void flush_tlb_range (struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned char seg, oldctx; - - start &= ~SUN3_PMEG_MASK; - - oldctx = sun3_get_context(); - sun3_put_context(mm->context); - - while(start < end) - { - if((seg = sun3_get_segmap(start)) == SUN3_INVALID_PMEG) - goto next; - if(pmeg_ctx[seg] == mm->context) { - pmeg_alloc[seg] = 0; - pmeg_ctx[seg] = 0; - pmeg_vaddr[seg] = 0; - } - sun3_put_segmap(start, SUN3_INVALID_PMEG); - next: - start += SUN3_PMEG_SIZE; - } -} - -static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) -{ - flush_tlb_all(); -} - -/* Flush kernel page from TLB. */ -static inline void flush_tlb_kernel_page (unsigned long addr) -{ - sun3_put_segmap (addr & ~(SUN3_PMEG_SIZE - 1), SUN3_INVALID_PMEG); -} - -#endif - -#endif /* _M68K_TLBFLUSH_H */ diff --git a/arch/m68k/include/asm/tlbflush_no.h b/arch/m68k/include/asm/tlbflush_no.h deleted file mode 100644 index a470cfb803eb..000000000000 --- a/arch/m68k/include/asm/tlbflush_no.h +++ /dev/null @@ -1,55 +0,0 @@ -#ifndef _M68KNOMMU_TLBFLUSH_H -#define _M68KNOMMU_TLBFLUSH_H - -/* - * Copyright (C) 2000 Lineo, David McCullough - * Copyright (C) 2000-2002, Greg Ungerer - */ - -#include - -/* - * flush all user-space atc entries. - */ -static inline void __flush_tlb(void) -{ - BUG(); -} - -static inline void __flush_tlb_one(unsigned long addr) -{ - BUG(); -} - -#define flush_tlb() __flush_tlb() - -/* - * flush all atc entries (both kernel and user-space entries). - */ -static inline void flush_tlb_all(void) -{ - BUG(); -} - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - BUG(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) -{ - BUG(); -} - -static inline void flush_tlb_range(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - BUG(); -} - -static inline void flush_tlb_kernel_page(unsigned long addr) -{ - BUG(); -} - -#endif /* _M68KNOMMU_TLBFLUSH_H */ -- cgit v1.2.3-59-g8ed1b From 454dc5f8228d08e6e9876c45d7f9bfd8b745e39d Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Fri, 20 Mar 2009 09:53:59 +1000 Subject: m68knommu: switch to using generic_handle_irq() Switch to using generic_handle_irq() instead of the deprecated __do_IRQ(). Signed-off-by: Greg Ungerer --- arch/m68knommu/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68knommu/kernel/irq.c b/arch/m68knommu/kernel/irq.c index bba1bb48a21f..56e0f4c55a67 100644 --- a/arch/m68knommu/kernel/irq.c +++ b/arch/m68knommu/kernel/irq.c @@ -23,7 +23,7 @@ asmlinkage void do_IRQ(int irq, struct pt_regs *regs) struct pt_regs *oldregs = set_irq_regs(regs); irq_enter(); - __do_IRQ(irq); + generic_handle_irq(irq); irq_exit(); set_irq_regs(oldregs); -- cgit v1.2.3-59-g8ed1b From 4ce2cba45a46668409606bdb9923164b51986807 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 24 Mar 2009 15:15:14 +1000 Subject: m68knommu: fix end of uart table marker The UART platform data structure is missing an empty struct at the end (as the end of structure marker). Signed-off-by: Greg Ungerer --- arch/m68knommu/platform/5249/config.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/m68knommu/platform/5249/config.c b/arch/m68knommu/platform/5249/config.c index d299f7b8768a..9c926dedb7c2 100644 --- a/arch/m68knommu/platform/5249/config.c +++ b/arch/m68knommu/platform/5249/config.c @@ -32,7 +32,8 @@ static struct mcf_platform_uart m5249_uart_platform[] = { { .mapbase = MCF_MBAR + MCFUART_BASE2, .irq = 74, - } + }, + { }, }; static struct platform_device m5249_uart = { -- cgit v1.2.3-59-g8ed1b From de1fc5c629f1597ddc996379642f9f3594dcdfbe Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 24 Mar 2009 16:50:09 +1000 Subject: m68knommu: fix 5249 ColdFire UART setup The ICR registers of the 5249 ColdFire processor are 8bits, not 32bits. Fix the read/write of these register to be the correct size. Signed-off-by: Greg Ungerer --- arch/m68knommu/platform/5249/config.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/m68knommu/platform/5249/config.c b/arch/m68knommu/platform/5249/config.c index 9c926dedb7c2..3b808c64e671 100644 --- a/arch/m68knommu/platform/5249/config.c +++ b/arch/m68knommu/platform/5249/config.c @@ -51,11 +51,11 @@ static struct platform_device *m5249_devices[] __initdata = { static void __init m5249_uart_init_line(int line, int irq) { if (line == 0) { - writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); + writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); writeb(irq, MCFUART_BASE1 + MCFUART_UIVR); mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1); } else if (line == 1) { - writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); + writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); writeb(irq, MCFUART_BASE2 + MCFUART_UIVR); mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2); } -- cgit v1.2.3-59-g8ed1b From d62db60659416499b763a00c953f23ed1c9a5c46 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 24 Mar 2009 17:14:32 +1000 Subject: m68knommu: fix 5249 ColdFire UART vector setup The address of the IVUR register is not correct, it should be offset into the MBAR region. Without this the vector is not set to the correct number. Signed-off-by: Greg Ungerer --- arch/m68knommu/platform/5249/config.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/m68knommu/platform/5249/config.c b/arch/m68knommu/platform/5249/config.c index 3b808c64e671..9eab19d01eb1 100644 --- a/arch/m68knommu/platform/5249/config.c +++ b/arch/m68knommu/platform/5249/config.c @@ -52,11 +52,11 @@ static void __init m5249_uart_init_line(int line, int irq) { if (line == 0) { writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); - writeb(irq, MCFUART_BASE1 + MCFUART_UIVR); + writeb(irq, MCF_MBAR + MCFUART_BASE1 + MCFUART_UIVR); mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1); } else if (line == 1) { writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); - writeb(irq, MCFUART_BASE2 + MCFUART_UIVR); + writeb(irq, MCF_MBAR + MCFUART_BASE2 + MCFUART_UIVR); mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2); } } -- cgit v1.2.3-59-g8ed1b From 1fc2d5c11918082536acf261ce6abb1f5511053f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:01 -0500 Subject: tracing/filters: use list_for_each_entry Impact: cleanup No need to use the safe version here, so use list_for_each_entry instead of list_for_each_entry_safe in find_event_field(). Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878841.8339.57.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4117c2ebc245..3f0b79f8a4bc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -147,9 +147,9 @@ int filter_print_preds(struct filter_pred **preds, char *buf) static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { - struct ftrace_event_field *field, *next; + struct ftrace_event_field *field; - list_for_each_entry_safe(field, next, &call->fields, link) { + list_for_each_entry(field, &call->fields, link) { if (!strcmp(field->name, name)) return field; } -- cgit v1.2.3-59-g8ed1b From 09f1f245c79585383de63e3ca54d0f91824bff3a Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:11 -0500 Subject: tracing/filters: free pred when clearing filters Impact: fix (small) per trace filter modification memory leak Free the current pred when clearing the filters via the filter files. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878851.8339.58.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fdab30d6c835..a9381384aa9e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -516,6 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (pred->clear) { filter_free_preds(call); + filter_free_pred(pred); return cnt; } @@ -581,6 +582,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (pred->clear) { filter_free_subsystem_preds(system); + filter_free_pred(pred); return cnt; } -- cgit v1.2.3-59-g8ed1b From 4bda2d517bfa3ce3d7044e06988cdddae7adffe2 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:31 -0500 Subject: tracing/filters: use trace_seq_printf() to print filters Impact: cleanup Instead of just using the trace_seq buffer to print the filters, use trace_seq_printf() as it was intended to be used. Reported-by: Steven Rostedt Signed-off-by: Tom Zanussi Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878871.8339.59.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 3 ++- kernel/trace/trace_events.c | 8 ++++---- kernel/trace/trace_events_filter.c | 25 +++++++++---------------- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54fd9bcd0a65..90a848debcba 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -840,7 +840,8 @@ struct filter_pred { int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size); extern void filter_free_pred(struct filter_pred *pred); -extern int filter_print_preds(struct filter_pred **preds, char *buf); +extern void filter_print_preds(struct filter_pred **preds, + struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a9381384aa9e..d132997ab756 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -481,8 +481,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - r = filter_print_preds(call->preds, s->buffer); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + filter_print_preds(call->preds, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -547,8 +547,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - r = filter_print_preds(system->preds, s->buffer); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r); + filter_print_preds(system->preds, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3f0b79f8a4bc..9fca8bb1c06b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -24,6 +24,7 @@ #include #include "trace.h" +#include "trace_output.h" static int filter_pred_64(struct filter_pred *pred, void *event) { @@ -108,16 +109,15 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } -int filter_print_preds(struct filter_pred **preds, char *buf) +void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) { - ssize_t this_len = 0; char *field_name; struct filter_pred *pred; int i; if (!preds) { - this_len += sprintf(buf + this_len, "none\n"); - return this_len; + trace_seq_printf(s, "none\n"); + return; } for (i = 0; i < MAX_FILTER_PRED; i++) { @@ -125,23 +125,16 @@ int filter_print_preds(struct filter_pred **preds, char *buf) pred = preds[i]; field_name = pred->field_name; if (i) - this_len += sprintf(buf + this_len, - pred->or ? "|| " : "&& "); - this_len += sprintf(buf + this_len, - "%s ", field_name); - this_len += sprintf(buf + this_len, - pred->not ? "!= " : "== "); + trace_seq_printf(s, pred->or ? "|| " : "&& "); + trace_seq_printf(s, "%s ", field_name); + trace_seq_printf(s, pred->not ? "!= " : "== "); if (pred->str_val) - this_len += sprintf(buf + this_len, - "%s\n", pred->str_val); + trace_seq_printf(s, "%s\n", pred->str_val); else - this_len += sprintf(buf + this_len, - "%llu\n", pred->val); + trace_seq_printf(s, "%llu\n", pred->val); } else break; } - - return this_len; } static struct ftrace_event_field * -- cgit v1.2.3-59-g8ed1b From 9f58a159d022c8f2533a27708aa267adf4f0e3ce Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 24 Mar 2009 02:14:42 -0500 Subject: tracing/filters: disallow integer values for string filters and vice versa Impact: fix filter use boundary condition / crash Make sure filters for string fields don't use integer values and vice versa. Getting it wrong can crash the system or produce bogus results. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878882.8339.61.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9fca8bb1c06b..026be412f356 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -237,9 +237,14 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) pred->offset = field->offset; if (is_string_field(field->type)) { + if (!pred->str_val) + return -EINVAL; pred->fn = filter_pred_string; pred->str_len = field->size; return __filter_add_pred(call, pred); + } else { + if (pred->str_val) + return -EINVAL; } switch (field->size) { -- cgit v1.2.3-59-g8ed1b From 9242ef12f0d174da1739a071fb4a5fc5de27905e Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 24 Mar 2009 17:22:05 +1000 Subject: m68knommu: fix 5307 ColdFire UART vector setup There is a couple of problems with the UART vector setup for the 5307 ColdFire UART. The ICR register access should be 8bit, not 32bit. The address of the UIVR register is wrong, it needs to be offset into the MBAR register region. Fix these. Signed-off-by: Greg Ungerer --- arch/m68knommu/platform/5307/config.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/m68knommu/platform/5307/config.c b/arch/m68knommu/platform/5307/config.c index 724faf05852a..44803bf70a6e 100644 --- a/arch/m68knommu/platform/5307/config.c +++ b/arch/m68knommu/platform/5307/config.c @@ -65,12 +65,12 @@ static struct platform_device *m5307_devices[] __initdata = { static void __init m5307_uart_init_line(int line, int irq) { if (line == 0) { - writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); - writeb(irq, MCFUART_BASE1 + MCFUART_UIVR); + writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); + writeb(irq, MCF_MBAR + MCFUART_BASE1 + MCFUART_UIVR); mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1); } else if (line == 1) { - writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); - writeb(irq, MCFUART_BASE2 + MCFUART_UIVR); + writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); + writeb(irq, MCF_MBAR + MCFUART_BASE2 + MCFUART_UIVR); mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2); } } -- cgit v1.2.3-59-g8ed1b From 522b3d49179a0d2dc4e152b77eb82fbfe97782f4 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 24 Mar 2009 17:33:41 +1000 Subject: m68knommu: fix 5407 ColdFire UART vector setup There is a couple of problems with the UART vector setup for the 5307 ColdFire UART. The ICR register access should be 8bit, not 32bit. The address of the UIVR register is wrong, it needs to be offset into the MBAR register region. Fix these. Signed-off-by: Greg Ungerer --- arch/m68knommu/platform/5407/config.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/m68knommu/platform/5407/config.c b/arch/m68knommu/platform/5407/config.c index 648b8b778211..0ee8c1a200c8 100644 --- a/arch/m68knommu/platform/5407/config.c +++ b/arch/m68knommu/platform/5407/config.c @@ -56,12 +56,12 @@ static struct platform_device *m5407_devices[] __initdata = { static void __init m5407_uart_init_line(int line, int irq) { if (line == 0) { - writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); - writeb(irq, MCFUART_BASE1 + MCFUART_UIVR); + writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI1, MCF_MBAR + MCFSIM_UART1ICR); + writeb(irq, MCF_MBAR + MCFUART_BASE1 + MCFUART_UIVR); mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART1); } else if (line == 1) { - writel(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); - writeb(irq, MCFUART_BASE2 + MCFUART_UIVR); + writeb(MCFSIM_ICR_LEVEL6 | MCFSIM_ICR_PRI2, MCF_MBAR + MCFSIM_UART2ICR); + writeb(irq, MCF_MBAR + MCFUART_BASE2 + MCFUART_UIVR); mcf_setimr(mcf_getimr() & ~MCFSIM_IMR_UART2); } } -- cgit v1.2.3-59-g8ed1b From 402d326519c1a4859c527702383f4e60f606ef52 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Feb 2009 10:40:00 +0000 Subject: NOMMU: Present backing device capabilities for MTD chardevs Present backing device capabilities for MTD character device files to allow NOMMU mmap to do direct mapping where possible. Signed-off-by: David Howells Tested-by: Bernd Schmidt Signed-off-by: David Woodhouse --- drivers/mtd/Makefile | 2 +- drivers/mtd/chips/map_ram.c | 17 ++++++++++++ drivers/mtd/chips/map_rom.c | 17 ++++++++++++ drivers/mtd/devices/mtdram.c | 14 ++++++++++ drivers/mtd/internal.h | 17 ++++++++++++ drivers/mtd/mtdbdi.c | 43 ++++++++++++++++++++++++++++++ drivers/mtd/mtdchar.c | 63 +++++++++++++++++++++++++++++++++++++++++++- drivers/mtd/mtdcore.c | 15 +++++++++++ drivers/mtd/mtdpart.c | 15 +++++++++++ include/linux/mtd/mtd.h | 14 ++++++++++ 10 files changed, 215 insertions(+), 2 deletions(-) create mode 100644 drivers/mtd/internal.h create mode 100644 drivers/mtd/mtdbdi.c diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile index 4521b1ecce45..82d1e4de475b 100644 --- a/drivers/mtd/Makefile +++ b/drivers/mtd/Makefile @@ -4,7 +4,7 @@ # Core functionality. obj-$(CONFIG_MTD) += mtd.o -mtd-y := mtdcore.o mtdsuper.o +mtd-y := mtdcore.o mtdsuper.o mtdbdi.o mtd-$(CONFIG_MTD_PARTITIONS) += mtdpart.o obj-$(CONFIG_MTD_CONCAT) += mtdconcat.o diff --git a/drivers/mtd/chips/map_ram.c b/drivers/mtd/chips/map_ram.c index 072dd8abf33a..6bdc50c727e7 100644 --- a/drivers/mtd/chips/map_ram.c +++ b/drivers/mtd/chips/map_ram.c @@ -21,6 +21,8 @@ static int mapram_write (struct mtd_info *, loff_t, size_t, size_t *, const u_ch static int mapram_erase (struct mtd_info *, struct erase_info *); static void mapram_nop (struct mtd_info *); static struct mtd_info *map_ram_probe(struct map_info *map); +static unsigned long mapram_unmapped_area(struct mtd_info *, unsigned long, + unsigned long, unsigned long); static struct mtd_chip_driver mapram_chipdrv = { @@ -64,6 +66,7 @@ static struct mtd_info *map_ram_probe(struct map_info *map) mtd->type = MTD_RAM; mtd->size = map->size; mtd->erase = mapram_erase; + mtd->get_unmapped_area = mapram_unmapped_area; mtd->read = mapram_read; mtd->write = mapram_write; mtd->sync = mapram_nop; @@ -79,6 +82,20 @@ static struct mtd_info *map_ram_probe(struct map_info *map) } +/* + * Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ +static unsigned long mapram_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + struct map_info *map = mtd->priv; + return (unsigned long) map->virt + offset; +} + static int mapram_read (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct map_info *map = mtd->priv; diff --git a/drivers/mtd/chips/map_rom.c b/drivers/mtd/chips/map_rom.c index c76d6e5f47ee..076090a67b90 100644 --- a/drivers/mtd/chips/map_rom.c +++ b/drivers/mtd/chips/map_rom.c @@ -20,6 +20,8 @@ static int maprom_write (struct mtd_info *, loff_t, size_t, size_t *, const u_ch static void maprom_nop (struct mtd_info *); static struct mtd_info *map_rom_probe(struct map_info *map); static int maprom_erase (struct mtd_info *mtd, struct erase_info *info); +static unsigned long maprom_unmapped_area(struct mtd_info *, unsigned long, + unsigned long, unsigned long); static struct mtd_chip_driver maprom_chipdrv = { .probe = map_rom_probe, @@ -40,6 +42,7 @@ static struct mtd_info *map_rom_probe(struct map_info *map) mtd->name = map->name; mtd->type = MTD_ROM; mtd->size = map->size; + mtd->get_unmapped_area = maprom_unmapped_area; mtd->read = maprom_read; mtd->write = maprom_write; mtd->sync = maprom_nop; @@ -53,6 +56,20 @@ static struct mtd_info *map_rom_probe(struct map_info *map) } +/* + * Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ +static unsigned long maprom_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + struct map_info *map = mtd->priv; + return (unsigned long) map->virt + offset; +} + static int maprom_read (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct map_info *map = mtd->priv; diff --git a/drivers/mtd/devices/mtdram.c b/drivers/mtd/devices/mtdram.c index 3aaca88847d3..fce5ff7589aa 100644 --- a/drivers/mtd/devices/mtdram.c +++ b/drivers/mtd/devices/mtdram.c @@ -65,6 +65,19 @@ static void ram_unpoint(struct mtd_info *mtd, loff_t from, size_t len) { } +/* + * Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ +static unsigned long ram_get_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + return (unsigned long) mtd->priv + offset; +} + static int ram_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { @@ -116,6 +129,7 @@ int mtdram_init_device(struct mtd_info *mtd, void *mapped_address, mtd->erase = ram_erase; mtd->point = ram_point; mtd->unpoint = ram_unpoint; + mtd->get_unmapped_area = ram_get_unmapped_area; mtd->read = ram_read; mtd->write = ram_write; diff --git a/drivers/mtd/internal.h b/drivers/mtd/internal.h new file mode 100644 index 000000000000..c658fe7216b5 --- /dev/null +++ b/drivers/mtd/internal.h @@ -0,0 +1,17 @@ +/* Internal MTD definitions + * + * Copyright © 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * mtdbdi.c + */ +extern struct backing_dev_info mtd_bdi_unmappable; +extern struct backing_dev_info mtd_bdi_ro_mappable; +extern struct backing_dev_info mtd_bdi_rw_mappable; diff --git a/drivers/mtd/mtdbdi.c b/drivers/mtd/mtdbdi.c new file mode 100644 index 000000000000..5ca5aed0b225 --- /dev/null +++ b/drivers/mtd/mtdbdi.c @@ -0,0 +1,43 @@ +/* MTD backing device capabilities + * + * Copyright © 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include "internal.h" + +/* + * backing device capabilities for non-mappable devices (such as NAND flash) + * - permits private mappings, copies are taken of the data + */ +struct backing_dev_info mtd_bdi_unmappable = { + .capabilities = BDI_CAP_MAP_COPY, +}; + +/* + * backing device capabilities for R/O mappable devices (such as ROM) + * - permits private mappings, copies are taken of the data + * - permits non-writable shared mappings + */ +struct backing_dev_info mtd_bdi_ro_mappable = { + .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | + BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP), +}; + +/* + * backing device capabilities for writable mappable devices (such as RAM) + * - permits private mappings, copies are taken of the data + * - permits non-writable shared mappings + */ +struct backing_dev_info mtd_bdi_rw_mappable = { + .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | + BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP | + BDI_CAP_WRITE_MAP), +}; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 2c8a16f49ca2..f478f1fc3949 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -107,12 +108,15 @@ static int mtd_open(struct inode *inode, struct file *file) goto out; } - if (MTD_ABSENT == mtd->type) { + if (mtd->type == MTD_ABSENT) { put_mtd_device(mtd); ret = -ENODEV; goto out; } + if (mtd->backing_dev_info) + file->f_mapping->backing_dev_info = mtd->backing_dev_info; + /* You can't open it RW if it's not a writeable device */ if ((file->f_mode & FMODE_WRITE) && !(mtd->flags & MTD_WRITEABLE)) { put_mtd_device(mtd); @@ -781,6 +785,59 @@ static int mtd_ioctl(struct inode *inode, struct file *file, return ret; } /* memory_ioctl */ +/* + * try to determine where a shared mapping can be made + * - only supported for NOMMU at the moment (MMU can't doesn't copy private + * mappings) + */ +#ifndef CONFIG_MMU +static unsigned long mtd_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct mtd_file_info *mfi = file->private_data; + struct mtd_info *mtd = mfi->mtd; + + if (mtd->get_unmapped_area) { + unsigned long offset; + + if (addr != 0) + return (unsigned long) -EINVAL; + + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; + + offset = pgoff << PAGE_SHIFT; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; + + return mtd->get_unmapped_area(mtd, len, offset, flags); + } + + /* can't map directly */ + return (unsigned long) -ENOSYS; +} +#endif + +/* + * set up a mapping for shared memory segments + */ +static int mtd_mmap(struct file *file, struct vm_area_struct *vma) +{ +#ifdef CONFIG_MMU + struct mtd_file_info *mfi = file->private_data; + struct mtd_info *mtd = mfi->mtd; + + if (mtd->type == MTD_RAM || mtd->type == MTD_ROM) + return 0; + return -ENOSYS; +#else + return vma->vm_flags & VM_SHARED ? 0 : -ENOSYS; +#endif +} + static const struct file_operations mtd_fops = { .owner = THIS_MODULE, .llseek = mtd_lseek, @@ -789,6 +846,10 @@ static const struct file_operations mtd_fops = { .ioctl = mtd_ioctl, .open = mtd_open, .release = mtd_close, + .mmap = mtd_mmap, +#ifndef CONFIG_MMU + .get_unmapped_area = mtd_get_unmapped_area, +#endif }; static int __init init_mtdchar(void) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 76fe0a1e7a5e..65a7f3703881 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -19,6 +19,7 @@ #include #include +#include "internal.h" #include "mtdcore.h" @@ -46,6 +47,20 @@ int add_mtd_device(struct mtd_info *mtd) { int i; + if (!mtd->backing_dev_info) { + switch (mtd->type) { + case MTD_RAM: + mtd->backing_dev_info = &mtd_bdi_rw_mappable; + break; + case MTD_ROM: + mtd->backing_dev_info = &mtd_bdi_ro_mappable; + break; + default: + mtd->backing_dev_info = &mtd_bdi_unmappable; + break; + } + } + BUG_ON(mtd->writesize == 0); mutex_lock(&mtd_table_mutex); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 144e6b613a77..06d5b9d8853a 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -84,6 +84,18 @@ static void part_unpoint(struct mtd_info *mtd, loff_t from, size_t len) part->master->unpoint(part->master, from + part->offset, len); } +static unsigned long part_get_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + struct mtd_part *part = PART(mtd); + + offset += part->offset; + return part->master->get_unmapped_area(part->master, len, offset, + flags); +} + static int part_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { @@ -342,6 +354,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->mtd.name = part->name; slave->mtd.owner = master->owner; + slave->mtd.backing_dev_info = master->backing_dev_info; slave->mtd.read = part_read; slave->mtd.write = part_write; @@ -354,6 +367,8 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->mtd.unpoint = part_unpoint; } + if (master->get_unmapped_area) + slave->mtd.get_unmapped_area = part_get_unmapped_area; if (master->read_oob) slave->mtd.read_oob = part_read_oob; if (master->write_oob) diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 3aa5d77c2cdb..0c079fd307a5 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -162,6 +162,20 @@ struct mtd_info { /* We probably shouldn't allow XIP if the unpoint isn't a NULL */ void (*unpoint) (struct mtd_info *mtd, loff_t from, size_t len); + /* Allow NOMMU mmap() to directly map the device (if not NULL) + * - return the address to which the offset maps + * - return -ENOSYS to indicate refusal to do the mapping + */ + unsigned long (*get_unmapped_area) (struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags); + + /* Backing device capabilities for this device + * - provides mmap capabilities + */ + struct backing_dev_info *backing_dev_info; + int (*read) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf); int (*write) (struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf); -- cgit v1.2.3-59-g8ed1b From 6e232cfce35a20a8702d9ac7709d35030c1b3271 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Feb 2009 10:40:05 +0000 Subject: NOMMU: Add support for direct mapping through mtdconcat if possible Add support for direct mapping through mtdconcat, if possible, by attaching the samebacking_dev_info structure to the master. It has some restrictions: (1) It won't permit direct mapping of concatenated devices that have differing BDIs. (2) It doesn't support maps that span the 'gap' between devices, although it possibly could if the devices spanned across return compatible (ie. contiguous) addresses from their get_unmapped_area() ops. Signed-off-by: Gavin Lambert Signed-off-by: David Howells Tested-by: Bernd Schmidt Signed-off-by: David Woodhouse --- drivers/mtd/mtdconcat.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 3dbb1b38db66..792b547786b8 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -683,6 +684,40 @@ static int concat_block_markbad(struct mtd_info *mtd, loff_t ofs) return err; } +/* + * try to support NOMMU mmaps on concatenated devices + * - we don't support subdev spanning as we can't guarantee it'll work + */ +static unsigned long concat_get_unmapped_area(struct mtd_info *mtd, + unsigned long len, + unsigned long offset, + unsigned long flags) +{ + struct mtd_concat *concat = CONCAT(mtd); + int i; + + for (i = 0; i < concat->num_subdev; i++) { + struct mtd_info *subdev = concat->subdev[i]; + + if (offset >= subdev->size) { + offset -= subdev->size; + continue; + } + + /* we've found the subdev over which the mapping will reside */ + if (offset + len > subdev->size) + return (unsigned long) -EINVAL; + + if (subdev->get_unmapped_area) + return subdev->get_unmapped_area(subdev, len, offset, + flags); + + break; + } + + return (unsigned long) -ENOSYS; +} + /* * This function constructs a virtual MTD device by concatenating * num_devs MTD devices. A pointer to the new device object is @@ -740,6 +775,8 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd.ecc_stats.badblocks = subdev[0]->ecc_stats.badblocks; + concat->mtd.backing_dev_info = subdev[0]->backing_dev_info; + concat->subdev[0] = subdev[0]; for (i = 1; i < num_devs; i++) { @@ -766,6 +803,15 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd.flags |= subdev[i]->flags & MTD_WRITEABLE; } + + /* only permit direct mapping if the BDIs are all the same + * - copy-mapping is still permitted + */ + if (concat->mtd.backing_dev_info != + subdev[i]->backing_dev_info) + concat->mtd.backing_dev_info = + &default_backing_dev_info; + concat->mtd.size += subdev[i]->size; concat->mtd.ecc_stats.badblocks += subdev[i]->ecc_stats.badblocks; @@ -796,6 +842,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd.unlock = concat_unlock; concat->mtd.suspend = concat_suspend; concat->mtd.resume = concat_resume; + concat->mtd.get_unmapped_area = concat_get_unmapped_area; /* * Combine the erase block size info of the subdevices: -- cgit v1.2.3-59-g8ed1b From da4458bda237aa0cb1688f6c359477f203788f6a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Feb 2009 10:40:10 +0000 Subject: NOMMU: Make it possible for RomFS to use MTD devices directly Change RomFS so that it can use MTD devices directly - without the intercession of the block layer - as well as using block devices. This permits RomFS: (1) to use the MTD direct mapping facility available under NOMMU conditions if the underlying device is directly accessible by the CPU (including XIP); (2) and thus to be used when the block layer is disabled. RomFS can be configured with support just for MTD devices, just for Block devices or for both. If RomFS is configured for both, then it will treat mtdblock device files as MTD backing stores, not block layer backing stores. I tested this using a CONFIG_MMU=n CONFIG_BLOCK=n kernel running on my FRV board with a RomFS image installed on the mtdram test device. I see my test program being run XIP: # cat /proc/maps ... c0c000b0-c0c01f8c r-xs 00000000 1f:00 144 /mnt/doshm ... GDB on the kernel can be used to show that these addresses are within the set-aside RAM space. Signed-off-by: David Howells Tested-by: Bernd Schmidt Signed-off-by: David Woodhouse --- fs/romfs/Kconfig | 24 ++ fs/romfs/Makefile | 9 +- fs/romfs/inode.c | 665 -------------------------------------------------- fs/romfs/internal.h | 47 ++++ fs/romfs/mmap-nommu.c | 75 ++++++ fs/romfs/storage.c | 261 ++++++++++++++++++++ fs/romfs/super.c | 648 ++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 1062 insertions(+), 667 deletions(-) delete mode 100644 fs/romfs/inode.c create mode 100644 fs/romfs/internal.h create mode 100644 fs/romfs/mmap-nommu.c create mode 100644 fs/romfs/storage.c create mode 100644 fs/romfs/super.c diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index 1a17020f9faf..802c742f002c 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -14,3 +14,27 @@ config ROMFS_FS If you don't know whether you need it, then you don't need it: answer N. + +config ROMFS_ON_BLOCK + bool "Block device-backed ROM file system support" if (ROMFS_ON_MTD && EMBEDDED) + depends on ROMFS_FS && BLOCK + help + This permits ROMFS to use block devices buffered through the page + cache as the medium from which to retrieve data. It does not allow + direct mapping of the medium. + + If unsure, answer Y. + +config ROMFS_ON_MTD + bool "MTD-backed ROM file system support" + depends on ROMFS_FS + depends on MTD=y || (ROMFS_FS=m && MTD) + help + This permits ROMFS to use MTD based devices directly, without the + intercession of the block layer (which may have been disabled). It + also allows direct mapping of MTD devices through romfs files under + NOMMU conditions if the underlying device is directly addressable by + the CPU. + + If unsure, answer Y. + diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile index c95b21cf49a3..420beb7d495c 100644 --- a/fs/romfs/Makefile +++ b/fs/romfs/Makefile @@ -1,7 +1,12 @@ # -# Makefile for the linux romfs filesystem routines. +# Makefile for the linux RomFS filesystem routines. # obj-$(CONFIG_ROMFS_FS) += romfs.o -romfs-objs := inode.o +romfs-y := storage.o super.o + +ifneq ($(CONFIG_MMU),y) +romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o +endif + diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c deleted file mode 100644 index 98a232f7196b..000000000000 --- a/fs/romfs/inode.c +++ /dev/null @@ -1,665 +0,0 @@ -/* - * ROMFS file system, Linux implementation - * - * Copyright (C) 1997-1999 Janos Farkas - * - * Using parts of the minix filesystem - * Copyright (C) 1991, 1992 Linus Torvalds - * - * and parts of the affs filesystem additionally - * Copyright (C) 1993 Ray Burr - * Copyright (C) 1996 Hans-Joachim Widmaier - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes - * Changed for 2.1.19 modules - * Jan 1997 Initial release - * Jun 1997 2.1.43+ changes - * Proper page locking in readpage - * Changed to work with 2.1.45+ fs - * Jul 1997 Fixed follow_link - * 2.1.47 - * lookup shouldn't return -ENOENT - * from Horst von Brand: - * fail on wrong checksum - * double unlock_super was possible - * correct namelen for statfs - * spotted by Bill Hawes: - * readlink shouldn't iput() - * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() - * exposed a problem in readdir - * 2.1.107 code-freeze spellchecker run - * Aug 1998 2.1.118+ VFS changes - * Sep 1998 2.1.122 another VFS change (follow_link) - * Apr 1999 2.2.7 no more EBADF checking in - * lookup/readdir, use ERR_PTR - * Jun 1999 2.3.6 d_alloc_root use changed - * 2.3.9 clean up usage of ENOENT/negative - * dentries in lookup - * clean up page flags setting - * (error, uptodate, locking) in - * in readpage - * use init_special_inode for - * fifos/sockets (and streamline) in - * read_inode, fix _ops table order - * Aug 1999 2.3.16 __initfunc() => __init change - * Oct 1999 2.3.24 page->owner hack obsoleted - * Nov 1999 2.3.27 2.3.25+ page->offset => index change - */ - -/* todo: - * - see Documentation/filesystems/romfs.txt - * - use allocated, not stack memory for file names? - * - considering write access... - * - network (tftp) files? - * - merge back some _op tables - */ - -/* - * Sorry about some optimizations and for some goto's. I just wanted - * to squeeze some more bytes out of this code.. :) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -struct romfs_inode_info { - unsigned long i_metasize; /* size of non-data area */ - unsigned long i_dataoffset; /* from the start of fs */ - struct inode vfs_inode; -}; - -static struct inode *romfs_iget(struct super_block *, unsigned long); - -/* instead of private superblock data */ -static inline unsigned long romfs_maxsize(struct super_block *sb) -{ - return (unsigned long)sb->s_fs_info; -} - -static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) -{ - return container_of(inode, struct romfs_inode_info, vfs_inode); -} - -static __u32 -romfs_checksum(void *data, int size) -{ - __u32 sum; - __be32 *ptr; - - sum = 0; ptr = data; - size>>=2; - while (size>0) { - sum += be32_to_cpu(*ptr++); - size--; - } - return sum; -} - -static const struct super_operations romfs_ops; - -static int romfs_fill_super(struct super_block *s, void *data, int silent) -{ - struct buffer_head *bh; - struct romfs_super_block *rsb; - struct inode *root; - int sz, ret = -EINVAL; - - /* I would parse the options here, but there are none.. :) */ - - sb_set_blocksize(s, ROMBSIZE); - s->s_maxbytes = 0xFFFFFFFF; - - bh = sb_bread(s, 0); - if (!bh) { - /* XXX merge with other printk? */ - printk ("romfs: unable to read superblock\n"); - goto outnobh; - } - - rsb = (struct romfs_super_block *)bh->b_data; - sz = be32_to_cpu(rsb->size); - if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 - || sz < ROMFH_SIZE) { - if (!silent) - printk ("VFS: Can't find a romfs filesystem on dev " - "%s.\n", s->s_id); - goto out; - } - if (romfs_checksum(rsb, min_t(int, sz, 512))) { - printk ("romfs: bad initial checksum on dev " - "%s.\n", s->s_id); - goto out; - } - - s->s_magic = ROMFS_MAGIC; - s->s_fs_info = (void *)(long)sz; - - s->s_flags |= MS_RDONLY; - - /* Find the start of the fs */ - sz = (ROMFH_SIZE + - strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD) - & ROMFH_MASK; - - s->s_op = &romfs_ops; - root = romfs_iget(s, sz); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; - } - - ret = -ENOMEM; - s->s_root = d_alloc_root(root); - if (!s->s_root) - goto outiput; - - brelse(bh); - return 0; - -outiput: - iput(root); -out: - brelse(bh); -outnobh: - return ret; -} - -/* That's simple too. */ - -static int -romfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - buf->f_type = ROMFS_MAGIC; - buf->f_bsize = ROMBSIZE; - buf->f_bfree = buf->f_bavail = buf->f_ffree; - buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS; - buf->f_namelen = ROMFS_MAXFN; - return 0; -} - -/* some helper routines */ - -static int -romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count) -{ - struct buffer_head *bh; - unsigned long avail, maxsize, res; - - maxsize = romfs_maxsize(i->i_sb); - if (offset >= maxsize) - return -1; - - /* strnlen is almost always valid */ - if (count > maxsize || offset+count > maxsize) - count = maxsize-offset; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; /* error */ - - avail = ROMBSIZE - (offset & ROMBMASK); - maxsize = min_t(unsigned long, count, avail); - res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize); - brelse(bh); - - if (res < maxsize) - return res; /* found all of it */ - - while (res < count) { - offset += maxsize; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; - maxsize = min_t(unsigned long, count - res, ROMBSIZE); - avail = strnlen(bh->b_data, maxsize); - res += avail; - brelse(bh); - if (avail < maxsize) - return res; - } - return res; -} - -static int -romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count) -{ - struct buffer_head *bh; - unsigned long avail, maxsize, res; - - maxsize = romfs_maxsize(i->i_sb); - if (offset >= maxsize || count > maxsize || offset+count>maxsize) - return -1; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; /* error */ - - avail = ROMBSIZE - (offset & ROMBMASK); - maxsize = min_t(unsigned long, count, avail); - memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize); - brelse(bh); - - res = maxsize; /* all of it */ - - while (res < count) { - offset += maxsize; - dest += maxsize; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; - maxsize = min_t(unsigned long, count - res, ROMBSIZE); - memcpy(dest, bh->b_data, maxsize); - brelse(bh); - res += maxsize; - } - return res; -} - -static unsigned char romfs_dtype_table[] = { - DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO -}; - -static int -romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct inode *i = filp->f_path.dentry->d_inode; - struct romfs_inode ri; - unsigned long offset, maxoff; - int j, ino, nextfh; - int stored = 0; - char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ - - lock_kernel(); - - maxoff = romfs_maxsize(i->i_sb); - - offset = filp->f_pos; - if (!offset) { - offset = i->i_ino & ROMFH_MASK; - if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0) - goto out; - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - } - - /* Not really failsafe, but we are read-only... */ - for(;;) { - if (!offset || offset >= maxoff) { - offset = maxoff; - filp->f_pos = offset; - goto out; - } - filp->f_pos = offset; - - /* Fetch inode info */ - if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0) - goto out; - - j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1); - if (j < 0) - goto out; - - fsname[j]=0; - romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j); - - ino = offset; - nextfh = be32_to_cpu(ri.next); - if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) - ino = be32_to_cpu(ri.spec); - if (filldir(dirent, fsname, j, offset, ino, - romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) { - goto out; - } - stored++; - offset = nextfh & ROMFH_MASK; - } -out: - unlock_kernel(); - return stored; -} - -static struct dentry * -romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - unsigned long offset, maxoff; - long res; - int fslen; - struct inode *inode = NULL; - char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ - struct romfs_inode ri; - const char *name; /* got from dentry */ - int len; - - res = -EACCES; /* placeholder for "no data here" */ - offset = dir->i_ino & ROMFH_MASK; - lock_kernel(); - if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0) - goto error; - - maxoff = romfs_maxsize(dir->i_sb); - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - - /* OK, now find the file whose name is in "dentry" in the - * directory specified by "dir". */ - - name = dentry->d_name.name; - len = dentry->d_name.len; - - for(;;) { - if (!offset || offset >= maxoff) - goto success; /* negative success */ - if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0) - goto error; - - /* try to match the first 16 bytes of name */ - fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE); - if (len < ROMFH_SIZE) { - if (len == fslen) { - /* both are shorter, and same size */ - romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1); - if (strncmp (name, fsname, len) == 0) - break; - } - } else if (fslen >= ROMFH_SIZE) { - /* both are longer; XXX optimize max size */ - fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1); - if (len == fslen) { - romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1); - if (strncmp(name, fsname, len) == 0) - break; - } - } - /* next entry */ - offset = be32_to_cpu(ri.next) & ROMFH_MASK; - } - - /* Hard link handling */ - if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - - inode = romfs_iget(dir->i_sb, offset); - if (IS_ERR(inode)) { - res = PTR_ERR(inode); - goto error; - } - -success: - d_add(dentry, inode); - res = 0; -error: - unlock_kernel(); - return ERR_PTR(res); -} - -/* - * Ok, we do readpage, to be able to execute programs. Unfortunately, - * we can't use bmap, since we may have looser alignments. - */ - -static int -romfs_readpage(struct file *file, struct page * page) -{ - struct inode *inode = page->mapping->host; - loff_t offset, size; - unsigned long filled; - void *buf; - int result = -EIO; - - page_cache_get(page); - lock_kernel(); - buf = kmap(page); - if (!buf) - goto err_out; - - /* 32 bit warning -- but not for us :) */ - offset = page_offset(page); - size = i_size_read(inode); - filled = 0; - result = 0; - if (offset < size) { - unsigned long readlen; - - size -= offset; - readlen = size > PAGE_SIZE ? PAGE_SIZE : size; - - filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen); - - if (filled != readlen) { - SetPageError(page); - filled = 0; - result = -EIO; - } - } - - if (filled < PAGE_SIZE) - memset(buf + filled, 0, PAGE_SIZE-filled); - - if (!result) - SetPageUptodate(page); - flush_dcache_page(page); - - unlock_page(page); - - kunmap(page); -err_out: - page_cache_release(page); - unlock_kernel(); - - return result; -} - -/* Mapping from our types to the kernel */ - -static const struct address_space_operations romfs_aops = { - .readpage = romfs_readpage -}; - -static const struct file_operations romfs_dir_operations = { - .read = generic_read_dir, - .readdir = romfs_readdir, -}; - -static const struct inode_operations romfs_dir_inode_operations = { - .lookup = romfs_lookup, -}; - -static mode_t romfs_modemap[] = -{ - 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777, - S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644 -}; - -static struct inode * -romfs_iget(struct super_block *sb, unsigned long ino) -{ - int nextfh, ret; - struct romfs_inode ri; - struct inode *i; - - ino &= ROMFH_MASK; - i = iget_locked(sb, ino); - if (!i) - return ERR_PTR(-ENOMEM); - if (!(i->i_state & I_NEW)) - return i; - - i->i_mode = 0; - - /* Loop for finding the real hard link */ - for(;;) { - if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) { - printk(KERN_ERR "romfs: read error for inode 0x%lx\n", - ino); - iget_failed(i); - return ERR_PTR(-EIO); - } - /* XXX: do romfs_checksum here too (with name) */ - - nextfh = be32_to_cpu(ri.next); - if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) - break; - - ino = be32_to_cpu(ri.spec) & ROMFH_MASK; - } - - i->i_nlink = 1; /* Hard to decide.. */ - i->i_size = be32_to_cpu(ri.size); - i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; - i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; - - /* Precalculate the data offset */ - ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN); - if (ret >= 0) - ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK; - else - ino = 0; - - ROMFS_I(i)->i_metasize = ino; - ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); - - /* Compute permissions */ - ino = romfs_modemap[nextfh & ROMFH_TYPE]; - /* only "normal" files have ops */ - switch (nextfh & ROMFH_TYPE) { - case 1: - i->i_size = ROMFS_I(i)->i_metasize; - i->i_op = &romfs_dir_inode_operations; - i->i_fop = &romfs_dir_operations; - if (nextfh & ROMFH_EXEC) - ino |= S_IXUGO; - i->i_mode = ino; - break; - case 2: - i->i_fop = &generic_ro_fops; - i->i_data.a_ops = &romfs_aops; - if (nextfh & ROMFH_EXEC) - ino |= S_IXUGO; - i->i_mode = ino; - break; - case 3: - i->i_op = &page_symlink_inode_operations; - i->i_data.a_ops = &romfs_aops; - i->i_mode = ino | S_IRWXUGO; - break; - default: - /* depending on MBZ for sock/fifos */ - nextfh = be32_to_cpu(ri.spec); - init_special_inode(i, ino, - MKDEV(nextfh>>16,nextfh&0xffff)); - } - unlock_new_inode(i); - return i; -} - -static struct kmem_cache * romfs_inode_cachep; - -static struct inode *romfs_alloc_inode(struct super_block *sb) -{ - struct romfs_inode_info *ei; - ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); - if (!ei) - return NULL; - return &ei->vfs_inode; -} - -static void romfs_destroy_inode(struct inode *inode) -{ - kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); -} - -static void init_once(void *foo) -{ - struct romfs_inode_info *ei = foo; - - inode_init_once(&ei->vfs_inode); -} - -static int init_inodecache(void) -{ - romfs_inode_cachep = kmem_cache_create("romfs_inode_cache", - sizeof(struct romfs_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (romfs_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - kmem_cache_destroy(romfs_inode_cachep); -} - -static int romfs_remount(struct super_block *sb, int *flags, char *data) -{ - *flags |= MS_RDONLY; - return 0; -} - -static const struct super_operations romfs_ops = { - .alloc_inode = romfs_alloc_inode, - .destroy_inode = romfs_destroy_inode, - .statfs = romfs_statfs, - .remount_fs = romfs_remount, -}; - -static int romfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super, - mnt); -} - -static struct file_system_type romfs_fs_type = { - .owner = THIS_MODULE, - .name = "romfs", - .get_sb = romfs_get_sb, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; - -static int __init init_romfs_fs(void) -{ - int err = init_inodecache(); - if (err) - goto out1; - err = register_filesystem(&romfs_fs_type); - if (err) - goto out; - return 0; -out: - destroy_inodecache(); -out1: - return err; -} - -static void __exit exit_romfs_fs(void) -{ - unregister_filesystem(&romfs_fs_type); - destroy_inodecache(); -} - -/* Yes, works even as a module... :) */ - -module_init(init_romfs_fs) -module_exit(exit_romfs_fs) -MODULE_LICENSE("GPL"); diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h new file mode 100644 index 000000000000..06044a9dc62d --- /dev/null +++ b/fs/romfs/internal.h @@ -0,0 +1,47 @@ +/* RomFS internal definitions + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +struct romfs_inode_info { + struct inode vfs_inode; + unsigned long i_metasize; /* size of non-data area */ + unsigned long i_dataoffset; /* from the start of fs */ +}; + +static inline size_t romfs_maxsize(struct super_block *sb) +{ + return (size_t) (unsigned long) sb->s_fs_info; +} + +static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) +{ + return container_of(inode, struct romfs_inode_info, vfs_inode); +} + +/* + * mmap-nommu.c + */ +#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD) +extern const struct file_operations romfs_ro_fops; +#else +#define romfs_ro_fops generic_ro_fops +#endif + +/* + * storage.c + */ +extern int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen); +extern ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen); +extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c new file mode 100644 index 000000000000..f0511e816967 --- /dev/null +++ b/fs/romfs/mmap-nommu.c @@ -0,0 +1,75 @@ +/* NOMMU mmap support for RomFS on MTD devices + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include "internal.h" + +/* + * try to determine where a shared mapping can be made + * - only supported for NOMMU at the moment (MMU can't doesn't copy private + * mappings) + * - attempts to map through to the underlying MTD device + */ +static unsigned long romfs_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct inode *inode = file->f_mapping->host; + struct mtd_info *mtd = inode->i_sb->s_mtd; + unsigned long isize, offset; + + if (!mtd) + goto cant_map_directly; + + isize = i_size_read(inode); + offset = pgoff << PAGE_SHIFT; + if (offset > isize || len > isize || offset > isize - len) + return (unsigned long) -EINVAL; + + /* we need to call down to the MTD layer to do the actual mapping */ + if (mtd->get_unmapped_area) { + if (addr != 0) + return (unsigned long) -EINVAL; + + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; + + offset += ROMFS_I(inode)->i_dataoffset; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; + + return mtd->get_unmapped_area(mtd, len, offset, flags); + } + +cant_map_directly: + return (unsigned long) -ENOSYS; +} + +/* + * permit a R/O mapping to be made directly through onto an MTD device if + * possible + */ +static int romfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; +} + +const struct file_operations romfs_ro_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .mmap = romfs_mmap, + .get_unmapped_area = romfs_get_unmapped_area, +}; diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c new file mode 100644 index 000000000000..7e3e1e12a081 --- /dev/null +++ b/fs/romfs/storage.c @@ -0,0 +1,261 @@ +/* RomFS storage access routines + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "internal.h" + +#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK) +#error no ROMFS backing store interface configured +#endif + +#ifdef CONFIG_ROMFS_ON_MTD +#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__)) + +/* + * read data from an romfs image on an MTD device + */ +static int romfs_mtd_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t rlen; + int ret; + + ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf); + return (ret < 0 || rlen != buflen) ? -EIO : 0; +} + +/* + * determine the length of a string in a romfs image on an MTD device + */ +static ssize_t romfs_mtd_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + ssize_t n = 0; + size_t segment; + u_char buf[16], *p; + size_t len; + int ret; + + /* scan the string up to 16 bytes at a time */ + while (maxlen > 0) { + segment = min_t(size_t, maxlen, 16); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + p = memchr(buf, 0, len); + if (p) + return n + (p - buf); + maxlen -= len; + pos += len; + n += len; + } + + return n; +} + +/* + * compare a string to one in a romfs image on MTD + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + u_char buf[16]; + size_t len, segment; + int ret; + + /* scan the string up to 16 bytes at a time */ + while (size > 0) { + segment = min_t(size_t, size, 16); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + if (memcmp(buf, str, len) != 0) + return 0; + size -= len; + pos += len; + str += len; + } + + return 1; +} +#endif /* CONFIG_ROMFS_ON_MTD */ + +#ifdef CONFIG_ROMFS_ON_BLOCK +/* + * read data from an romfs image on a block device + */ +static int romfs_blk_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + + /* copy the string up to blocksize bytes at a time */ + while (buflen > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, buflen, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + memcpy(buf, bh->b_data + offset, segment); + brelse(bh); + buflen -= segment; + pos += segment; + } + + return 0; +} + +/* + * determine the length of a string in romfs on a block device + */ +static ssize_t romfs_blk_strnlen(struct super_block *sb, + unsigned long pos, size_t limit) +{ + struct buffer_head *bh; + unsigned long offset; + ssize_t n = 0; + size_t segment; + u_char *buf, *p; + + /* scan the string up to blocksize bytes at a time */ + while (limit > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, limit, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + buf = bh->b_data + offset; + p = memchr(buf, 0, segment); + brelse(bh); + if (p) + return n + (p - buf); + limit -= segment; + pos += segment; + n += segment; + } + + return n; +} + +/* + * compare a string to one in a romfs image on a block device + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + bool x; + + /* scan the string up to 16 bytes at a time */ + while (size > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, size, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + x = (memcmp(bh->b_data + offset, str, segment) != 0); + brelse(bh); + if (x) + return 0; + size -= segment; + pos += segment; + str += segment; + } + + return 1; +} +#endif /* CONFIG_ROMFS_ON_BLOCK */ + +/* + * read data from the romfs image + */ +int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (buflen > limit - pos) + buflen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_read(sb, pos, buf, buflen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_read(sb, pos, buf, buflen); +#endif + return -EIO; +} + +/* + * determine the length of a string in romfs + */ +ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (maxlen > limit - pos) + maxlen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strnlen(sb, pos, limit); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strnlen(sb, pos, limit); +#endif + return -EIO; +} + +/* + * compare a string to one in romfs + * - return 1 if matched, 0 if differ, -ve if error + */ +int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (size > ROMFS_MAXFN) + return -ENAMETOOLONG; + if (size > limit - pos) + return -EIO; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strncmp(sb, pos, str, size); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strncmp(sb, pos, str, size); +#endif + return -EIO; +} diff --git a/fs/romfs/super.c b/fs/romfs/super.c new file mode 100644 index 000000000000..1e548a4975ba --- /dev/null +++ b/fs/romfs/super.c @@ -0,0 +1,648 @@ +/* Block- or MTD-based romfs + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * Derived from: ROMFS file system, Linux implementation + * + * Copyright © 1997-1999 Janos Farkas + * + * Using parts of the minix filesystem + * Copyright © 1991, 1992 Linus Torvalds + * + * and parts of the affs filesystem additionally + * Copyright © 1993 Ray Burr + * Copyright © 1996 Hans-Joachim Widmaier + * + * Changes + * Changed for 2.1.19 modules + * Jan 1997 Initial release + * Jun 1997 2.1.43+ changes + * Proper page locking in readpage + * Changed to work with 2.1.45+ fs + * Jul 1997 Fixed follow_link + * 2.1.47 + * lookup shouldn't return -ENOENT + * from Horst von Brand: + * fail on wrong checksum + * double unlock_super was possible + * correct namelen for statfs + * spotted by Bill Hawes: + * readlink shouldn't iput() + * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() + * exposed a problem in readdir + * 2.1.107 code-freeze spellchecker run + * Aug 1998 2.1.118+ VFS changes + * Sep 1998 2.1.122 another VFS change (follow_link) + * Apr 1999 2.2.7 no more EBADF checking in + * lookup/readdir, use ERR_PTR + * Jun 1999 2.3.6 d_alloc_root use changed + * 2.3.9 clean up usage of ENOENT/negative + * dentries in lookup + * clean up page flags setting + * (error, uptodate, locking) in + * in readpage + * use init_special_inode for + * fifos/sockets (and streamline) in + * read_inode, fix _ops table order + * Aug 1999 2.3.16 __initfunc() => __init change + * Oct 1999 2.3.24 page->owner hack obsoleted + * Nov 1999 2.3.27 2.3.25+ page->offset => index change + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct kmem_cache *romfs_inode_cachep; + +static const umode_t romfs_modemap[8] = { + 0, /* hard link */ + S_IFDIR | 0644, /* directory */ + S_IFREG | 0644, /* regular file */ + S_IFLNK | 0777, /* symlink */ + S_IFBLK | 0600, /* blockdev */ + S_IFCHR | 0600, /* chardev */ + S_IFSOCK | 0644, /* socket */ + S_IFIFO | 0644 /* FIFO */ +}; + +static const unsigned char romfs_dtype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO +}; + +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); + +/* + * read a page worth of data from the image + */ +static int romfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + loff_t offset, size; + unsigned long fillsize, pos; + void *buf; + int ret; + + buf = kmap(page); + if (!buf) + return -ENOMEM; + + /* 32 bit warning -- but not for us :) */ + offset = page_offset(page); + size = i_size_read(inode); + fillsize = 0; + ret = 0; + if (offset < size) { + size -= offset; + fillsize = size > PAGE_SIZE ? PAGE_SIZE : size; + + pos = ROMFS_I(inode)->i_dataoffset + offset; + + ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); + if (ret < 0) { + SetPageError(page); + fillsize = 0; + ret = -EIO; + } + } + + if (fillsize < PAGE_SIZE) + memset(buf + fillsize, 0, PAGE_SIZE - fillsize); + if (ret == 0) + SetPageUptodate(page); + + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + return ret; +} + +static const struct address_space_operations romfs_aops = { + .readpage = romfs_readpage +}; + +/* + * read the entries from a directory + */ +static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *i = filp->f_dentry->d_inode; + struct romfs_inode ri; + unsigned long offset, maxoff; + int j, ino, nextfh; + int stored = 0; + char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ + int ret; + + maxoff = romfs_maxsize(i->i_sb); + + offset = filp->f_pos; + if (!offset) { + offset = i->i_ino & ROMFH_MASK; + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* Not really failsafe, but we are read-only... */ + for (;;) { + if (!offset || offset >= maxoff) { + offset = maxoff; + filp->f_pos = offset; + goto out; + } + filp->f_pos = offset; + + /* Fetch inode info */ + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + + j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, + sizeof(fsname) - 1); + if (j < 0) + goto out; + + ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); + if (ret < 0) + goto out; + fsname[j] = '\0'; + + ino = offset; + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) + ino = be32_to_cpu(ri.spec); + if (filldir(dirent, fsname, j, offset, ino, + romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) + goto out; + + stored++; + offset = nextfh & ROMFH_MASK; + } + +out: + return stored; +} + +/* + * look up an entry in a directory + */ +static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + unsigned long offset, maxoff; + struct inode *inode; + struct romfs_inode ri; + const char *name; /* got from dentry */ + int len, ret; + + offset = dir->i_ino & ROMFH_MASK; + ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto error; + + /* search all the file entries in the list starting from the one + * pointed to by the directory's special data */ + maxoff = romfs_maxsize(dir->i_sb); + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + name = dentry->d_name.name; + len = dentry->d_name.len; + + for (;;) { + if (!offset || offset >= maxoff) + goto out0; + + ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* try to match the first 16 bytes of name */ + ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name, + len); + if (ret < 0) + goto error; + if (ret == 1) + break; + + /* next entry */ + offset = be32_to_cpu(ri.next) & ROMFH_MASK; + } + + /* Hard link handling */ + if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + inode = romfs_iget(dir->i_sb, offset); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto error; + } + goto outi; + + /* + * it's a bit funky, _lookup needs to return an error code + * (negative) or a NULL, both as a dentry. ENOENT should not + * be returned, instead we need to create a negative dentry by + * d_add(dentry, NULL); and return 0 as no error. + * (Although as I see, it only matters on writable file + * systems). + */ +out0: + inode = NULL; +outi: + d_add(dentry, inode); + ret = 0; +error: + return ERR_PTR(ret); +} + +static const struct file_operations romfs_dir_operations = { + .read = generic_read_dir, + .readdir = romfs_readdir, +}; + +static struct inode_operations romfs_dir_inode_operations = { + .lookup = romfs_lookup, +}; + +/* + * get a romfs inode based on its position in the image (which doubles as the + * inode number) + */ +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) +{ + struct romfs_inode_info *inode; + struct romfs_inode ri; + struct inode *i; + unsigned long nlen; + unsigned nextfh, ret; + umode_t mode; + + /* we might have to traverse a chain of "hard link" file entries to get + * to the actual file */ + for (;;) { + ret = romfs_dev_read(sb, pos, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* XXX: do romfs_checksum here too (with name) */ + + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) + break; + + pos = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* determine the length of the filename */ + nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN); + if (IS_ERR_VALUE(nlen)) + goto eio; + + /* get an inode for this image position */ + i = iget_locked(sb, pos); + if (!i) + return ERR_PTR(-ENOMEM); + + if (!(i->i_state & I_NEW)) + return i; + + /* precalculate the data offset */ + inode = ROMFS_I(i); + inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; + inode->i_dataoffset = pos + inode->i_metasize; + + i->i_nlink = 1; /* Hard to decide.. */ + i->i_size = be32_to_cpu(ri.size); + i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; + i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; + + /* set up mode and ops */ + mode = romfs_modemap[nextfh & ROMFH_TYPE]; + + switch (nextfh & ROMFH_TYPE) { + case ROMFH_DIR: + i->i_size = ROMFS_I(i)->i_metasize; + i->i_op = &romfs_dir_inode_operations; + i->i_fop = &romfs_dir_operations; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_REG: + i->i_fop = &romfs_ro_fops; + i->i_data.a_ops = &romfs_aops; + if (i->i_sb->s_mtd) + i->i_data.backing_dev_info = + i->i_sb->s_mtd->backing_dev_info; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_SYM: + i->i_op = &page_symlink_inode_operations; + i->i_data.a_ops = &romfs_aops; + mode |= S_IRWXUGO; + break; + default: + /* depending on MBZ for sock/fifos */ + nextfh = be32_to_cpu(ri.spec); + init_special_inode(i, mode, MKDEV(nextfh >> 16, + nextfh & 0xffff)); + break; + } + + i->i_mode = mode; + + unlock_new_inode(i); + return i; + +eio: + ret = -EIO; +error: + printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); + return ERR_PTR(ret); +} + +/* + * allocate a new inode + */ +static struct inode *romfs_alloc_inode(struct super_block *sb) +{ + struct romfs_inode_info *inode; + inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); + return inode ? &inode->vfs_inode : NULL; +} + +/* + * return a spent inode to the slab cache + */ +static void romfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); +} + +/* + * get filesystem statistics + */ +static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + buf->f_type = ROMFS_MAGIC; + buf->f_namelen = ROMFS_MAXFN; + buf->f_bsize = ROMBSIZE; + buf->f_bfree = buf->f_bavail = buf->f_ffree; + buf->f_blocks = + (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; + return 0; +} + +/* + * remounting must involve read-only + */ +static int romfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations romfs_super_ops = { + .alloc_inode = romfs_alloc_inode, + .destroy_inode = romfs_destroy_inode, + .statfs = romfs_statfs, + .remount_fs = romfs_remount, +}; + +/* + * checksum check on part of a romfs filesystem + */ +static __u32 romfs_checksum(const void *data, int size) +{ + const __be32 *ptr = data; + __u32 sum; + + sum = 0; + size >>= 2; + while (size > 0) { + sum += be32_to_cpu(*ptr++); + size--; + } + return sum; +} + +/* + * fill in the superblock + */ +static int romfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct romfs_super_block *rsb; + struct inode *root; + unsigned long pos, img_size; + const char *storage; + size_t len; + int ret; + +#ifdef CONFIG_BLOCK + if (!sb->s_mtd) { + sb_set_blocksize(sb, ROMBSIZE); + } else { + sb->s_blocksize = ROMBSIZE; + sb->s_blocksize_bits = blksize_bits(ROMBSIZE); + } +#endif + + sb->s_maxbytes = 0xFFFFFFFF; + sb->s_magic = ROMFS_MAGIC; + sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_op = &romfs_super_ops; + + /* read the image superblock and check it */ + rsb = kmalloc(512, GFP_KERNEL); + if (!rsb) + return -ENOMEM; + + sb->s_fs_info = (void *) 512; + ret = romfs_dev_read(sb, 0, rsb, 512); + if (ret < 0) + goto error_rsb; + + img_size = be32_to_cpu(rsb->size); + + if (sb->s_mtd && img_size > sb->s_mtd->size) + goto error_rsb_inval; + + sb->s_fs_info = (void *) img_size; + + if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || + img_size < ROMFH_SIZE) { + if (!silent) + printk(KERN_WARNING "VFS:" + " Can't find a romfs filesystem on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { + printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + storage = sb->s_mtd ? "MTD" : "the block layer"; + + len = strnlen(rsb->name, ROMFS_MAXFN); + if (!silent) + printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n", + (unsigned) len, (unsigned) len, rsb->name, storage); + + kfree(rsb); + rsb = NULL; + + /* find the root directory */ + pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; + + root = romfs_iget(sb, pos); + if (!root) + goto error; + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) + goto error_i; + + return 0; + +error_i: + iput(root); +error: + return -EINVAL; +error_rsb_inval: + ret = -EINVAL; +error_rsb: + return ret; +} + +/* + * get a superblock for mounting + */ +static int romfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + int ret = -EINVAL; + +#ifdef CONFIG_ROMFS_ON_MTD + ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super, + mnt); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (ret == -EINVAL) + ret = get_sb_bdev(fs_type, flags, dev_name, data, + romfs_fill_super, mnt); +#endif + return ret; +} + +/* + * destroy a romfs superblock in the appropriate manner + */ +static void romfs_kill_sb(struct super_block *sb) +{ +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) { + kill_mtd_super(sb); + return; + } +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) { + kill_block_super(sb); + return; + } +#endif +} + +static struct file_system_type romfs_fs_type = { + .owner = THIS_MODULE, + .name = "romfs", + .get_sb = romfs_get_sb, + .kill_sb = romfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * inode storage initialiser + */ +static void romfs_i_init_once(void *_inode) +{ + struct romfs_inode_info *inode = _inode; + + inode_init_once(&inode->vfs_inode); +} + +/* + * romfs module initialisation + */ +static int __init init_romfs_fs(void) +{ + int ret; + + printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n"); + + romfs_inode_cachep = + kmem_cache_create("romfs_i", + sizeof(struct romfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + romfs_i_init_once); + + if (!romfs_inode_cachep) { + printk(KERN_ERR + "ROMFS error: Failed to initialise inode cache\n"); + return -ENOMEM; + } + ret = register_filesystem(&romfs_fs_type); + if (ret) { + printk(KERN_ERR "ROMFS error: Failed to register filesystem\n"); + goto error_register; + } + return 0; + +error_register: + kmem_cache_destroy(romfs_inode_cachep); + return ret; +} + +/* + * romfs module removal + */ +static void __exit exit_romfs_fs(void) +{ + unregister_filesystem(&romfs_fs_type); + kmem_cache_destroy(romfs_inode_cachep); +} + +module_init(init_romfs_fs); +module_exit(exit_romfs_fs); + +MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */ -- cgit v1.2.3-59-g8ed1b From f52fd5b7fd11e85fe9de15d5c5b5d574f9ff4cab Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Feb 2009 12:31:54 +0000 Subject: NOMMU: Fix the RomFS Kconfig to ensure at least one backing store is selected Fix the configuration of the RomFS to make sure that at least one backing store method is always selected. This is done by rendering it down to a choice item that selects between Block, MTD and both. This also works correctly in the case that CONFIG_MTD=m: MTD cannot be selected as a backing store unless CONFIG_ROMFS_FS is also 'm'. Signed-off-by: David Howells Signed-off-by: David Woodhouse --- fs/romfs/Kconfig | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index 802c742f002c..ce2d6bcc6266 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -1,6 +1,6 @@ config ROMFS_FS tristate "ROM file system support" - depends on BLOCK + depends on BLOCK || MTD ---help--- This is a very small read-only file system mainly intended for initial ram disks of installation disks, but it could be used for @@ -15,9 +15,19 @@ config ROMFS_FS If you don't know whether you need it, then you don't need it: answer N. -config ROMFS_ON_BLOCK - bool "Block device-backed ROM file system support" if (ROMFS_ON_MTD && EMBEDDED) - depends on ROMFS_FS && BLOCK +# +# Select the backing stores to be supported +# +choice + prompt "RomFS backing stores" + depends on ROMFS_FS + default ROMFS_BACKED_BY_BLOCK + help + Select the backing stores to be supported. + +config ROMFS_BACKED_BY_BLOCK + bool "Block device-backed ROM file system support" + depends on BLOCK help This permits ROMFS to use block devices buffered through the page cache as the medium from which to retrieve data. It does not allow @@ -25,9 +35,8 @@ config ROMFS_ON_BLOCK If unsure, answer Y. -config ROMFS_ON_MTD +config ROMFS_BACKED_BY_MTD bool "MTD-backed ROM file system support" - depends on ROMFS_FS depends on MTD=y || (ROMFS_FS=m && MTD) help This permits ROMFS to use MTD based devices directly, without the @@ -38,3 +47,16 @@ config ROMFS_ON_MTD If unsure, answer Y. +config ROMFS_BACKED_BY_BOTH + bool "Both the above" + depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD)) +endchoice + + +config ROMFS_ON_BLOCK + bool + default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + +config ROMFS_ON_MTD + bool + default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH -- cgit v1.2.3-59-g8ed1b From 158772c2d4178525365dd46b8223184a861df58f Mon Sep 17 00:00:00 2001 From: Bernd Schmidt Date: Thu, 12 Feb 2009 10:40:15 +0000 Subject: [MTD] Fix a bad dependency in the Blackfin code Fix a bad dependency in the Blackfin code on a RomFS config symbol that doesn't exist upstream. Signed-off-by: Bernd Schmidt Signed-off-by: David Howells Tested-by: Bernd Schmidt Signed-off-by: David Woodhouse --- arch/blackfin/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 33e2e8993f7f..7f7ce076184b 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -337,7 +337,7 @@ int _access_ok(unsigned long addr, unsigned long size) if (addr >= memory_mtd_end && (addr + size) <= physical_mem_end) return 1; -#ifdef CONFIG_ROMFS_MTD_FS +#ifdef CONFIG_ROMFS_ON_MTD /* For XIP, allow user space to use pointers within the ROMFS. */ if (addr >= memory_mtd_start && (addr + size) <= memory_mtd_end) return 1; -- cgit v1.2.3-59-g8ed1b From c0e6616ae69774f42fda7a8cc5dc699aff311b40 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Tue, 24 Mar 2009 18:27:24 +0900 Subject: [MTD] [NAND] sh_flctl: fix hardware ecc handling for 2048 byte page Signed-off-by: Jeremy Baker Signed-off-by: Yoshihiro Shimoda Signed-off-by: David Woodhouse --- drivers/mtd/nand/sh_flctl.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c index 821acb08ff1c..2bc896623e2d 100644 --- a/drivers/mtd/nand/sh_flctl.c +++ b/drivers/mtd/nand/sh_flctl.c @@ -58,7 +58,7 @@ static struct nand_bbt_descr flctl_4secc_smallpage = { }; static struct nand_bbt_descr flctl_4secc_largepage = { - .options = 0, + .options = NAND_BBT_SCAN2NDPAGE, .offs = 58, .len = 2, .pattern = scan_ff_pattern, @@ -149,7 +149,7 @@ static void wait_wfifo_ready(struct sh_flctl *flctl) printk(KERN_ERR "wait_wfifo_ready(): Timeout occured \n"); } -static int wait_recfifo_ready(struct sh_flctl *flctl) +static int wait_recfifo_ready(struct sh_flctl *flctl, int sector_number) { uint32_t timeout = LOOP_TIMEOUT_MAX; int checked[4]; @@ -183,7 +183,12 @@ static int wait_recfifo_ready(struct sh_flctl *flctl) uint8_t org; int index; - index = data >> 16; + if (flctl->page_size) + index = (512 * sector_number) + + (data >> 16); + else + index = data >> 16; + org = flctl->done_buff[index]; flctl->done_buff[index] = org ^ (data & 0xFF); checked[i] = 1; @@ -238,14 +243,14 @@ static void read_fiforeg(struct sh_flctl *flctl, int rlen, int offset) } } -static int read_ecfiforeg(struct sh_flctl *flctl, uint8_t *buff) +static int read_ecfiforeg(struct sh_flctl *flctl, uint8_t *buff, int sector) { int i; unsigned long *ecc_buf = (unsigned long *)buff; void *fifo_addr = (void *)FLECFIFO(flctl); for (i = 0; i < 4; i++) { - if (wait_recfifo_ready(flctl)) + if (wait_recfifo_ready(flctl , sector)) return 1; ecc_buf[i] = readl(fifo_addr); ecc_buf[i] = be32_to_cpu(ecc_buf[i]); @@ -384,7 +389,8 @@ static void execmd_read_page_sector(struct mtd_info *mtd, int page_addr) read_fiforeg(flctl, 512, 512 * sector); ret = read_ecfiforeg(flctl, - &flctl->done_buff[mtd->writesize + 16 * sector]); + &flctl->done_buff[mtd->writesize + 16 * sector], + sector); if (ret) flctl->hwecc_cant_correct[sector] = 1; -- cgit v1.2.3-59-g8ed1b From 58c610bd1a3f50820e45a7c09ec0e44d2cda15dd Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 18 Mar 2009 15:33:05 +0800 Subject: intel-iommu: Snooping control support Snooping control enabled IOMMU to guarantee DMA cache coherency and thus reduce software effort (VMM) in maintaining effective memory type. Signed-off-by: Sheng Yang Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 38 +++++++++++++++++++++++++++++++++----- include/linux/intel-iommu.h | 2 +- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f3f686581a90..be999ff025af 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -231,6 +231,7 @@ struct dmar_domain { int flags; /* flags to find out type of domain */ int iommu_coherency;/* indicate coherency of iommu access */ + int iommu_snooping; /* indicate snooping control feature*/ int iommu_count; /* reference count of iommu */ spinlock_t iommu_lock; /* protect iommu set in domain */ u64 max_addr; /* maximum mapped address */ @@ -421,7 +422,6 @@ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) return g_iommus[iommu_id]; } -/* "Coherency" capability may be different across iommus */ static void domain_update_iommu_coherency(struct dmar_domain *domain) { int i; @@ -438,6 +438,29 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain) } } +static void domain_update_iommu_snooping(struct dmar_domain *domain) +{ + int i; + + domain->iommu_snooping = 1; + + i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus); + for (; i < g_num_of_iommus; ) { + if (!ecap_sc_support(g_iommus[i]->ecap)) { + domain->iommu_snooping = 0; + break; + } + i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1); + } +} + +/* Some capabilities may be different across iommus */ +static void domain_update_iommu_cap(struct dmar_domain *domain) +{ + domain_update_iommu_coherency(domain); + domain_update_iommu_snooping(domain); +} + static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn) { struct dmar_drhd_unit *drhd = NULL; @@ -1429,6 +1452,11 @@ static int domain_init(struct dmar_domain *domain, int guest_width) else domain->iommu_coherency = 0; + if (ecap_sc_support(iommu->ecap)) + domain->iommu_snooping = 1; + else + domain->iommu_snooping = 0; + domain->iommu_count = 1; /* always allocate the top pgd */ @@ -1557,7 +1585,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, spin_lock_irqsave(&domain->iommu_lock, flags); if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) { domain->iommu_count++; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); } spin_unlock_irqrestore(&domain->iommu_lock, flags); return 0; @@ -2820,7 +2848,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, spin_lock_irqsave(&domain->iommu_lock, tmp_flags); clear_bit(iommu->seq_id, &domain->iommu_bmp); domain->iommu_count--; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags); } @@ -2848,13 +2876,13 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) iommu_detach_dev(iommu, info->bus, info->devfn); /* clear this iommu in iommu_bmp, update iommu count - * and coherency + * and capabilities */ spin_lock_irqsave(&domain->iommu_lock, flags2); if (test_and_clear_bit(iommu->seq_id, &domain->iommu_bmp)) { domain->iommu_count--; - domain_update_iommu_coherency(domain); + domain_update_iommu_cap(domain); } spin_unlock_irqrestore(&domain->iommu_lock, flags2); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d2e3cbfba14f..3ad894004938 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -123,7 +123,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) #define ecap_eim_support(e) ((e >> 4) & 0x1) #define ecap_ir_support(e) ((e >> 3) & 0x1) #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) - +#define ecap_sc_support(e) ((e >> 7) & 0x1) /* Snooping Control */ /* IOTLB_REG */ #define DMA_TLB_FLUSH_GRANU_OFFSET 60 -- cgit v1.2.3-59-g8ed1b From dbb9fd8630e95b6155aff658a2b5f80e95ca2bc6 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 18 Mar 2009 15:33:06 +0800 Subject: iommu: Add domain_has_cap iommu_ops This iommu_op can tell if domain have a specific capability, like snooping control for Intel IOMMU, which can be used by other components of kernel to adjust the behaviour. Signed-off-by: Sheng Yang Signed-off-by: David Woodhouse --- arch/x86/kernel/amd_iommu.c | 7 +++++++ drivers/base/iommu.c | 7 +++++++ drivers/pci/intel-iommu.c | 12 ++++++++++++ include/linux/iommu.h | 12 ++++++++++++ 4 files changed, 38 insertions(+) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 5113c080f0c4..65c9b58655ff 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1924,6 +1924,12 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, return paddr; } +static int amd_iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + return 0; +} + static struct iommu_ops amd_iommu_ops = { .domain_init = amd_iommu_domain_init, .domain_destroy = amd_iommu_domain_destroy, @@ -1932,5 +1938,6 @@ static struct iommu_ops amd_iommu_ops = { .map = amd_iommu_map_range, .unmap = amd_iommu_unmap_range, .iova_to_phys = amd_iommu_iova_to_phys, + .domain_has_cap = amd_iommu_domain_has_cap, }; diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c index 5e039d4f877c..c314f144825f 100644 --- a/drivers/base/iommu.c +++ b/drivers/base/iommu.c @@ -98,3 +98,10 @@ phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, return iommu_ops->iova_to_phys(domain, iova); } EXPORT_SYMBOL_GPL(iommu_iova_to_phys); + +int iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + return iommu_ops->domain_has_cap(domain, cap); +} +EXPORT_SYMBOL_GPL(iommu_domain_has_cap); diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index be999ff025af..3778ab149baf 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3158,6 +3158,17 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, return phys; } +static int intel_iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + struct dmar_domain *dmar_domain = domain->priv; + + if (cap == IOMMU_CAP_CACHE_COHERENCY) + return dmar_domain->iommu_snooping; + + return 0; +} + static struct iommu_ops intel_iommu_ops = { .domain_init = intel_iommu_domain_init, .domain_destroy = intel_iommu_domain_destroy, @@ -3166,6 +3177,7 @@ static struct iommu_ops intel_iommu_ops = { .map = intel_iommu_map_range, .unmap = intel_iommu_unmap_range, .iova_to_phys = intel_iommu_iova_to_phys, + .domain_has_cap = intel_iommu_domain_has_cap, }; static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8a7bfb1b6ca0..0cf3a4e43f23 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -28,6 +28,8 @@ struct iommu_domain { void *priv; }; +#define IOMMU_CAP_CACHE_COHERENCY 0x1 + struct iommu_ops { int (*domain_init)(struct iommu_domain *domain); void (*domain_destroy)(struct iommu_domain *domain); @@ -39,6 +41,8 @@ struct iommu_ops { size_t size); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, unsigned long iova); + int (*domain_has_cap)(struct iommu_domain *domain, + unsigned long cap); }; #ifdef CONFIG_IOMMU_API @@ -57,6 +61,8 @@ extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova, size_t size); extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, unsigned long iova); +extern int iommu_domain_has_cap(struct iommu_domain *domain, + unsigned long cap); #else /* CONFIG_IOMMU_API */ @@ -107,6 +113,12 @@ static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, return 0; } +static inline int domain_has_cap(struct iommu_domain *domain, + unsigned long cap) +{ + return 0; +} + #endif /* CONFIG_IOMMU_API */ #endif /* __LINUX_IOMMU_H */ -- cgit v1.2.3-59-g8ed1b From 9cf0669746be19a4906a6c48920060bcf54c708b Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 18 Mar 2009 15:33:07 +0800 Subject: intel-iommu: VT-d page table to support snooping control bit The user can request to enable snooping control through VT-d page table. Signed-off-by: Sheng Yang Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 12 +++++++++++- include/linux/dma_remapping.h | 1 + include/linux/iommu.h | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 3778ab149baf..a0ba568b831c 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -164,7 +164,8 @@ static inline void context_clear_entry(struct context_entry *context) * 1: writable * 2-6: reserved * 7: super page - * 8-11: available + * 8-10: available + * 11: snoop behavior * 12-63: Host physcial address */ struct dma_pte { @@ -186,6 +187,11 @@ static inline void dma_set_pte_writable(struct dma_pte *pte) pte->val |= DMA_PTE_WRITE; } +static inline void dma_set_pte_snp(struct dma_pte *pte) +{ + pte->val |= DMA_PTE_SNP; +} + static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot) { pte->val = (pte->val & ~3) | (prot & 3); @@ -1685,6 +1691,8 @@ domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova, BUG_ON(dma_pte_addr(pte)); dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT); dma_set_pte_prot(pte, prot); + if (prot & DMA_PTE_SNP) + dma_set_pte_snp(pte); domain_flush_cache(domain, pte, sizeof(*pte)); start_pfn++; index++; @@ -3105,6 +3113,8 @@ static int intel_iommu_map_range(struct iommu_domain *domain, prot |= DMA_PTE_READ; if (iommu_prot & IOMMU_WRITE) prot |= DMA_PTE_WRITE; + if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) + prot |= DMA_PTE_SNP; max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size); if (dmar_domain->max_addr < max_addr) { diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index af1dab41674b..1a455f1f86d7 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -11,6 +11,7 @@ #define DMA_PTE_READ (1) #define DMA_PTE_WRITE (2) +#define DMA_PTE_SNP (1 << 11) struct intel_iommu; struct dmar_domain; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0cf3a4e43f23..3af4ffd591b9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -21,6 +21,7 @@ #define IOMMU_READ (1) #define IOMMU_WRITE (2) +#define IOMMU_CACHE (4) /* DMA cache coherency */ struct device; -- cgit v1.2.3-59-g8ed1b From 3aa551c9b4c40018f0e261a178e3d25478dc04a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 23 Mar 2009 18:28:15 +0100 Subject: genirq: add threaded interrupt handler support Add support for threaded interrupt handlers: A device driver can request that its main interrupt handler runs in a thread. To achive this the device driver requests the interrupt with request_threaded_irq() and provides additionally to the handler a thread function. The handler function is called in hard interrupt context and needs to check whether the interrupt originated from the device. If the interrupt originated from the device then the handler can either return IRQ_HANDLED or IRQ_WAKE_THREAD. IRQ_HANDLED is returned when no further action is required. IRQ_WAKE_THREAD causes the genirq code to invoke the threaded (main) handler. When IRQ_WAKE_THREAD is returned handler must have disabled the interrupt on the device level. This is mandatory for shared interrupt handlers, but we need to do it as well for obscure x86 hardware where disabling an interrupt on the IO_APIC level redirects the interrupt to the legacy PIC interrupt lines. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar --- include/linux/hardirq.h | 2 +- include/linux/interrupt.h | 37 ++++++++- include/linux/irq.h | 5 ++ include/linux/irqreturn.h | 2 + include/linux/sched.h | 5 ++ kernel/exit.c | 2 + kernel/irq/handle.c | 31 +++++++- kernel/irq/manage.c | 192 ++++++++++++++++++++++++++++++++++++++++++---- 8 files changed, 259 insertions(+), 17 deletions(-) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index f83288347dda..2dfaadbdb2ac 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -105,7 +105,7 @@ # define IRQ_EXIT_OFFSET HARDIRQ_OFFSET #endif -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS) extern void synchronize_irq(unsigned int irq); #else # define synchronize_irq(irq) barrier() diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0c9cb63e6895..6fc2b720c231 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -59,6 +59,16 @@ #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 +/* + * Bits used by threaded handlers: + * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run + * IRQTF_DIED - handler thread died + */ +enum { + IRQTF_RUNTHREAD, + IRQTF_DIED, +}; + typedef irqreturn_t (*irq_handler_t)(int, void *); /** @@ -71,6 +81,9 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @dir: pointer to the proc/irq/NN/name entry + * @thread_fn: interupt handler function for threaded interrupts + * @thread: thread pointer for threaded interrupts + * @thread_flags: flags related to @thread */ struct irqaction { irq_handler_t handler; @@ -81,11 +94,31 @@ struct irqaction { struct irqaction *next; int irq; struct proc_dir_entry *dir; + irq_handler_t thread_fn; + struct task_struct *thread; + unsigned long thread_flags; }; extern irqreturn_t no_action(int cpl, void *dev_id); -extern int __must_check request_irq(unsigned int, irq_handler_t handler, - unsigned long, const char *, void *); + +extern int __must_check +request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, + unsigned long flags, const char *name, void *dev); + +static inline int __must_check +request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, + const char *name, void *dev) +{ + return request_threaded_irq(irq, handler, NULL, flags, name, dev); +} + +#ifdef CONFIG_GENERIC_HARDIRQS +extern void exit_irq_thread(void); +#else +static inline void exit_irq_thread(void) { } +#endif + extern void free_irq(unsigned int, void *); struct device; diff --git a/include/linux/irq.h b/include/linux/irq.h index 873e4ac11b81..8b1cf0630210 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -155,6 +156,8 @@ struct irq_2_iommu; * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing * @pending_mask: pending rebalanced interrupts + * @threads_active: number of irqaction threads currently running + * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers * @dir: /proc/irq/ procfs entry * @name: flow handler name for /proc/interrupts output */ @@ -186,6 +189,8 @@ struct irq_desc { cpumask_var_t pending_mask; #endif #endif + atomic_t threads_active; + wait_queue_head_t wait_for_threads; #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; #endif diff --git a/include/linux/irqreturn.h b/include/linux/irqreturn.h index c5584ca5b8c9..819acaaac3f5 100644 --- a/include/linux/irqreturn.h +++ b/include/linux/irqreturn.h @@ -5,10 +5,12 @@ * enum irqreturn * @IRQ_NONE interrupt was not from this device * @IRQ_HANDLED interrupt was handled by this device + * @IRQ_WAKE_THREAD handler requests to wake the handler thread */ enum irqreturn { IRQ_NONE, IRQ_HANDLED, + IRQ_WAKE_THREAD, }; typedef enum irqreturn irqreturn_t; diff --git a/include/linux/sched.h b/include/linux/sched.h index 46d680643f89..38b77b0f56e5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1292,6 +1292,11 @@ struct task_struct { /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ spinlock_t alloc_lock; +#ifdef CONFIG_GENERIC_HARDIRQS + /* IRQ handler threads */ + struct irqaction *irqaction; +#endif + /* Protection of the PI data structures: */ spinlock_t pi_lock; diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c6..ca0b3488c4a9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1037,6 +1037,8 @@ NORET_TYPE void do_exit(long code) schedule(); } + exit_irq_thread(); + exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9ebf77968871..fe8f45374e86 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -357,8 +357,37 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) do { ret = action->handler(irq, action->dev_id); - if (ret == IRQ_HANDLED) + + switch (ret) { + case IRQ_WAKE_THREAD: + /* + * Wake up the handler thread for this + * action. In case the thread crashed and was + * killed we just pretend that we handled the + * interrupt. The hardirq handler above has + * disabled the device interrupt, so no irq + * storm is lurking. + */ + if (likely(!test_bit(IRQTF_DIED, + &action->thread_flags))) { + set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + wake_up_process(action->thread); + } + + /* + * Set it to handled so the spurious check + * does not trigger. + */ + ret = IRQ_HANDLED; + /* Fall through to add to randomness */ + case IRQ_HANDLED: status |= action->flags; + break; + + default: + break; + } + retval |= ret; action = action->next; } while (action); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6458e99984c0..a4c1ab86cd25 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -8,16 +8,15 @@ */ #include +#include #include #include #include #include +#include #include "internals.h" -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -cpumask_var_t irq_default_affinity; - /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq) /* Oops, that failed? */ } while (status & IRQ_INPROGRESS); + + /* + * We made sure that no hardirq handler is running. Now verify + * that no threaded handlers are active. + */ + wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } EXPORT_SYMBOL(synchronize_irq); +#ifdef CONFIG_SMP +cpumask_var_t irq_default_affinity; + /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check @@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq) return 1; } +static void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +{ + struct irqaction *action = desc->action; + + while (action) { + if (action->thread) + set_cpus_allowed_ptr(action->thread, cpumask); + action = action->next; + } +} + /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity @@ -100,6 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif + irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -150,6 +171,8 @@ int irq_select_affinity_usr(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); + if (!ret) + irq_set_thread_affinity(desc, desc->affinity); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -384,6 +407,93 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } +static inline int irq_thread_should_run(struct irqaction *action) +{ + return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags); +} + +static int irq_wait_for_interrupt(struct irqaction *action) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (irq_thread_should_run(action)) { + __set_current_state(TASK_RUNNING); + return 0; + } else + schedule(); + } + return -1; +} + +/* + * Interrupt handler thread + */ +static int irq_thread(void *data) +{ + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; + struct irqaction *action = data; + struct irq_desc *desc = irq_to_desc(action->irq); + int wake; + + sched_setscheduler(current, SCHED_FIFO, ¶m); + current->irqaction = action; + + while (!irq_wait_for_interrupt(action)) { + + atomic_inc(&desc->threads_active); + + spin_lock_irq(&desc->lock); + if (unlikely(desc->status & IRQ_DISABLED)) { + /* + * CHECKME: We might need a dedicated + * IRQ_THREAD_PENDING flag here, which + * retriggers the thread in check_irq_resend() + * but AFAICT IRQ_PENDING should be fine as it + * retriggers the interrupt itself --- tglx + */ + desc->status |= IRQ_PENDING; + spin_unlock_irq(&desc->lock); + } else { + spin_unlock_irq(&desc->lock); + + action->thread_fn(action->irq, action->dev_id); + } + + wake = atomic_dec_and_test(&desc->threads_active); + + if (wake && waitqueue_active(&desc->wait_for_threads)) + wake_up(&desc->wait_for_threads); + } + + /* + * Clear irqaction. Otherwise exit_irq_thread() would make + * fuzz about an active irq thread going into nirvana. + */ + current->irqaction = NULL; + return 0; +} + +/* + * Called from do_exit() + */ +void exit_irq_thread(void) +{ + struct task_struct *tsk = current; + + if (!tsk->irqaction) + return; + + printk(KERN_ERR + "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + + /* + * Set the THREAD DIED flag to prevent further wakeups of the + * soon to be gone threaded handler. + */ + set_bit(IRQTF_DIED, &tsk->irqaction->flags); +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -419,6 +529,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } + /* + * Threaded handler ? + */ + if (new->thread_fn) { + struct task_struct *t; + + t = kthread_create(irq_thread, new, "irq/%d-%s", irq, + new->name); + if (IS_ERR(t)) + return PTR_ERR(t); + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code + * references an already freed task_struct. + */ + get_task_struct(t); + new->thread = t; + wake_up_process(t); + } + /* * The following block of code has to be executed atomically */ @@ -456,15 +586,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!shared) { irq_chip_set_defaults(desc->chip); + init_waitqueue_head(&desc->wait_for_threads); + /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, irq, new->flags & IRQF_TRIGGER_MASK); - if (ret) { - spin_unlock_irqrestore(&desc->lock, flags); - return ret; - } + if (ret) + goto out_thread; } else compat_irq_chip_set_default_handler(desc); #if defined(CONFIG_IRQ_PER_CPU) @@ -532,8 +662,19 @@ mismatch: dump_stack(); } #endif + ret = -EBUSY; + +out_thread: spin_unlock_irqrestore(&desc->lock, flags); - return -EBUSY; + if (new->thread) { + struct task_struct *t = new->thread; + + new->thread = NULL; + if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) + kthread_stop(t); + put_task_struct(t); + } + return ret; } /** @@ -559,6 +700,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; + struct task_struct *irqthread; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -605,6 +747,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) else desc->chip->disable(irq); } + + irqthread = action->thread; + action->thread = NULL; + spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -612,6 +758,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU: */ synchronize_irq(irq); + if (irqthread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(irqthread); + put_task_struct(irqthread); + } + #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -664,9 +816,12 @@ void free_irq(unsigned int irq, void *dev_id) EXPORT_SYMBOL(free_irq); /** - * request_irq - allocate an interrupt line + * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs + * @handler: Function to be called when the IRQ occurs. + * Primary handler for threaded interrupts + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function @@ -678,6 +833,15 @@ EXPORT_SYMBOL(free_irq); * raises, you must take care both to initialise your hardware * and to set up the interrupt handler in the right order. * + * If you want to set up a threaded irq handler for your device + * then you need to supply @handler and @thread_fn. @handler ist + * still called in hard interrupt context and has to check + * whether the interrupt originates from the device. If yes it + * needs to disable the interrupt on the device and return + * IRQ_THREAD_WAKE which will wake up the handler thread and run + * @thread_fn. This split handler design is necessary to support + * shared interrupts. + * * Dev_id must be globally unique. Normally the address of the * device data structure is used as the cookie. Since the handler * receives this value it makes sense to use it. @@ -693,8 +857,9 @@ EXPORT_SYMBOL(free_irq); * IRQF_TRIGGER_* Specify active edge(s) or level * */ -int request_irq(unsigned int irq, irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) +int request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, unsigned long irqflags, + const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -742,6 +907,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, return -ENOMEM; action->handler = handler; + action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; @@ -771,4 +937,4 @@ int request_irq(unsigned int irq, irq_handler_t handler, #endif return retval; } -EXPORT_SYMBOL(request_irq); +EXPORT_SYMBOL(request_threaded_irq); -- cgit v1.2.3-59-g8ed1b From 935bd5b971f0df7c06d214d022cf8392e2f37952 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 23 Mar 2009 18:28:16 +0100 Subject: genirq: add support for threaded interrupts to devres Some devices use devres_request_irq() for to install their interrupt handler. Add support for threaded interrupts to devres as well. [tglx - simplified and adapted to latest threadirq version] Signed-off-by: Arjan van de Ven Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 17 ++++++++++++++--- kernel/irq/devres.c | 16 ++++++++++------ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6fc2b720c231..dbf6a6fd116c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -123,9 +123,20 @@ extern void free_irq(unsigned int, void *); struct device; -extern int __must_check devm_request_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id); +extern int __must_check +devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id); + +static inline int __must_check +devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, + unsigned long irqflags, const char *devname, void *dev_id) +{ + return devm_request_threaded_irq(dev, irq, handler, NULL, irqflags, + devname, dev_id); +} + extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); /* diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 38a25b8d8bff..d06df9c41cba 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data) } /** - * devm_request_irq - allocate an interrupt line for a managed device + * devm_request_threaded_irq - allocate an interrupt line for a managed device * @dev: device to request interrupt for * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function @@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * If an IRQ allocated with this function needs to be freed * separately, dev_free_irq() must be used. */ -int devm_request_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) +int devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id) { struct irq_devres *dr; int rc; @@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq, if (!dr) return -ENOMEM; - rc = request_irq(irq, handler, irqflags, devname, dev_id); + rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname, + dev_id); if (rc) { devres_free(dr); return rc; @@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq, return 0; } -EXPORT_SYMBOL(devm_request_irq); +EXPORT_SYMBOL(devm_request_threaded_irq); /** * devm_free_irq - free an interrupt -- cgit v1.2.3-59-g8ed1b From f48fe81e5b032914183e9a17052313720c2cac56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Mar 2009 11:46:22 +0100 Subject: genirq: threaded irq handlers review fixups Delta patch to address the review comments. - Implement warning when IRQ_WAKE_THREAD is requested and no thread handler installed - coding style fixes Pointed-out-by: Christoph Hellwig Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 2 ++ kernel/irq/handle.c | 29 ++++++++++++++++++++++++----- kernel/irq/manage.c | 17 +++++++---------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index dbf6a6fd116c..266a5f5f57cc 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -63,10 +63,12 @@ * Bits used by threaded handlers: * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run * IRQTF_DIED - handler thread died + * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed */ enum { IRQTF_RUNTHREAD, IRQTF_DIED, + IRQTF_WARNED, }; typedef irqreturn_t (*irq_handler_t)(int, void *); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index fe8f45374e86..38b49a9e508a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -338,6 +338,15 @@ irqreturn_t no_action(int cpl, void *dev_id) return IRQ_NONE; } +static void warn_no_thread(unsigned int irq, struct irqaction *action) +{ + if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags)) + return; + + printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD " + "but no thread function available.", irq, action->name); +} + /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -360,6 +369,21 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) switch (ret) { case IRQ_WAKE_THREAD: + /* + * Set result to handled so the spurious check + * does not trigger. + */ + ret = IRQ_HANDLED; + + /* + * Catch drivers which return WAKE_THREAD but + * did not set up a thread function + */ + if (unlikely(!action->thread_fn)) { + warn_no_thread(irq, action); + break; + } + /* * Wake up the handler thread for this * action. In case the thread crashed and was @@ -374,11 +398,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) wake_up_process(action->thread); } - /* - * Set it to handled so the spurious check - * does not trigger. - */ - ret = IRQ_HANDLED; /* Fall through to add to randomness */ case IRQ_HANDLED: status |= action->flags; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a4c1ab86cd25..a3eb7baf1e46 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -407,20 +407,17 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } -static inline int irq_thread_should_run(struct irqaction *action) -{ - return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags); -} - static int irq_wait_for_interrupt(struct irqaction *action) { while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); - if (irq_thread_should_run(action)) { + + if (test_and_clear_bit(IRQTF_RUNTHREAD, + &action->thread_flags)) { __set_current_state(TASK_RUNNING); return 0; - } else - schedule(); + } + schedule(); } return -1; } @@ -820,8 +817,8 @@ EXPORT_SYMBOL(free_irq); * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs. * Primary handler for threaded interrupts - * @thread_fn: Function called from the irq handler thread - * If NULL, no irq thread is created + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function -- cgit v1.2.3-59-g8ed1b From e4955c9986a27bb47ddeb6cd55803053f547e2e9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:04:37 +0800 Subject: blktrace: mark ddir_act[] const Impact: cleanup ddir_act and what2act always stay immutable. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C89415.5080503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 108f4f7715a5..1ffcbd4ac45b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,8 +147,8 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, /* * Data direction bit lookup */ -static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), - BLK_TC_ACT(BLK_TC_WRITE) }; +static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; /* The ilog2() calls fall out because they're constant */ #define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ @@ -1116,10 +1116,10 @@ static void blk_tracer_reset(struct trace_array *tr) blk_tracer_stop(tr); } -static struct { +static const struct { const char *act[2]; int (*print)(struct trace_seq *s, const struct trace_entry *ent); -} what2act[] __read_mostly = { +} what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, -- cgit v1.2.3-59-g8ed1b From 65796348e09880e12b97267d39b8857c758440a6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:05:06 +0800 Subject: blktrace: fix wrong calculation of RWBS Impact: fix the output of IO type category characters Trace categories are the upper 16 bits, not the lower 16 bits. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C89432.8010805@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1ffcbd4ac45b..9af41430ee54 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -922,23 +922,24 @@ static void blk_unregister_tracepoints(void) static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) { int i = 0; + int tc = t->action >> BLK_TC_SHIFT; - if (t->action & BLK_TC_DISCARD) + if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (t->action & BLK_TC_WRITE) + else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; - if (t->action & BLK_TC_AHEAD) + if (tc & BLK_TC_AHEAD) rwbs[i++] = 'A'; - if (t->action & BLK_TC_BARRIER) + if (tc & BLK_TC_BARRIER) rwbs[i++] = 'B'; - if (t->action & BLK_TC_SYNC) + if (tc & BLK_TC_SYNC) rwbs[i++] = 'S'; - if (t->action & BLK_TC_META) + if (tc & BLK_TC_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; -- cgit v1.2.3-59-g8ed1b From e0dc81bec0927fa0c8aabc521793161909eef7a5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 16:05:51 +0800 Subject: blktrace: fix t_error() Impact: fix error flag output t_error() should return t->error but not t->sector. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C8945F.5020802@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9af41430ee54..f69f8bd8bef9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -968,7 +968,7 @@ static inline unsigned long long t_sector(const struct trace_entry *ent) static inline __u16 t_error(const struct trace_entry *ent) { - return te_blk_io_trace(ent)->sector; + return te_blk_io_trace(ent)->error; } static __u64 get_pdu_int(const struct trace_entry *ent) -- cgit v1.2.3-59-g8ed1b From 093419971e03362a00f499960557c119982ea09f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 24 Mar 2009 17:43:30 +0800 Subject: blktrace: print human-readable act_mask Impact: new feature, allow symbolic values in /debug/tracing/act_mask Print stringified act_mask instead of hex value: # cat act_mask read,write,barrier,sync,queue,requeue,issue,complete,fs,pc,ahead,meta, discard,drv_data # echo "meta,write" > act_mask # cat act_mask write,meta Also: - make act_mask accept "ahead", "meta", "discard" and "drv_data" - use strsep() instead of strchr() to parse user input - return -EINVAL if a token is not found in the mask map - fix a bug that 'value' is unsigned, so it can < 0 - propagate error value of blk_trace_mask2str() to userspace, but not always return -ENXIO. Signed-off-by: Li Zefan Acked-by: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo LKML-Reference: <49C8AB42.1000802@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 103 ++++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 38 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f69f8bd8bef9..6fb274f5f34e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1316,53 +1316,77 @@ struct attribute_group blk_trace_attr_group = { .attrs = blk_trace_attrs, }; -static int blk_str2act_mask(const char *str) +static const struct { + int mask; + const char *str; +} mask_maps[] = { + { BLK_TC_READ, "read" }, + { BLK_TC_WRITE, "write" }, + { BLK_TC_BARRIER, "barrier" }, + { BLK_TC_SYNC, "sync" }, + { BLK_TC_QUEUE, "queue" }, + { BLK_TC_REQUEUE, "requeue" }, + { BLK_TC_ISSUE, "issue" }, + { BLK_TC_COMPLETE, "complete" }, + { BLK_TC_FS, "fs" }, + { BLK_TC_PC, "pc" }, + { BLK_TC_AHEAD, "ahead" }, + { BLK_TC_META, "meta" }, + { BLK_TC_DISCARD, "discard" }, + { BLK_TC_DRV_DATA, "drv_data" }, +}; + +static int blk_trace_str2mask(const char *str) { + int i; int mask = 0; - char *copy = kstrdup(str, GFP_KERNEL), *s; + char *s, *token; - if (copy == NULL) + s = kstrdup(str, GFP_KERNEL); + if (s == NULL) return -ENOMEM; - - s = strstrip(copy); + s = strstrip(s); while (1) { - char *sep = strchr(s, ','); - - if (sep != NULL) - *sep = '\0'; - - if (strcasecmp(s, "barrier") == 0) - mask |= BLK_TC_BARRIER; - else if (strcasecmp(s, "complete") == 0) - mask |= BLK_TC_COMPLETE; - else if (strcasecmp(s, "fs") == 0) - mask |= BLK_TC_FS; - else if (strcasecmp(s, "issue") == 0) - mask |= BLK_TC_ISSUE; - else if (strcasecmp(s, "pc") == 0) - mask |= BLK_TC_PC; - else if (strcasecmp(s, "queue") == 0) - mask |= BLK_TC_QUEUE; - else if (strcasecmp(s, "read") == 0) - mask |= BLK_TC_READ; - else if (strcasecmp(s, "requeue") == 0) - mask |= BLK_TC_REQUEUE; - else if (strcasecmp(s, "sync") == 0) - mask |= BLK_TC_SYNC; - else if (strcasecmp(s, "write") == 0) - mask |= BLK_TC_WRITE; - - if (sep == NULL) + token = strsep(&s, ","); + if (token == NULL) break; - s = sep + 1; + if (*token == '\0') + continue; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (strcasecmp(token, mask_maps[i].str) == 0) { + mask |= mask_maps[i].mask; + break; + } + } + if (i == ARRAY_SIZE(mask_maps)) { + mask = -EINVAL; + break; + } } - kfree(copy); + kfree(s); return mask; } +static ssize_t blk_trace_mask2str(char *buf, int mask) +{ + int i; + char *p = buf; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (mask & mask_maps[i].mask) { + p += sprintf(p, "%s%s", + (p == buf) ? "" : ",", mask_maps[i].str); + } + } + *p++ = '\n'; + + return p - buf; +} + static struct request_queue *blk_trace_get_queue(struct block_device *bdev) { if (bdev->bd_disk == NULL) @@ -1399,7 +1423,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q->blk_trace == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); else if (attr == &dev_attr_pid) ret = sprintf(buf, "%u\n", q->blk_trace->pid); else if (attr == &dev_attr_start_lba) @@ -1424,7 +1448,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct request_queue *q; struct hd_struct *p; u64 value; - ssize_t ret = -ENXIO; + ssize_t ret = -EINVAL; if (count == 0) goto out; @@ -1432,13 +1456,16 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (attr == &dev_attr_act_mask) { if (sscanf(buf, "%llx", &value) != 1) { /* Assume it is a list of trace category names */ - value = blk_str2act_mask(buf); - if (value < 0) + ret = blk_trace_str2mask(buf); + if (ret < 0) goto out; + value = ret; } } else if (sscanf(buf, "%llu", &value) != 1) goto out; + ret = -ENXIO; + lock_kernel(); p = dev_to_part(dev); bdev = bdget(part_devt(p)); -- cgit v1.2.3-59-g8ed1b From 098335215a4921a8a54193829eaed602dca24df5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 21 Mar 2009 02:44:50 -0400 Subject: tracing: fix memory leak in trace_stat If the function profiler does not have any items recorded and one were to cat the function stat file, the kernel would take a BUG with a NULL pointer dereference. Looking further into this, I found that returning NULL from stat_start did not stop the stat logic, and would later call stat_next. This breaks from the way seq_file works, so I looked into fixing the stat code. This is where I noticed that the last next_entry is never freed. It is allocated, and if the stat_next returns NULL, the code breaks out of the loop, unlocks the mutex and exits. We never link the next_entry nor do we free it. Thus it is a real memory leak. This patch rearranges the code a bit to not only fix the memory leak, but also to act more like seq_file where nothing is printed if there is nothing to print. That is, stat_start returns NULL. Signed-off-by: Steven Rostedt --- kernel/trace/trace_stat.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 39310e3434ee..f71b85b22cfe 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -75,7 +75,7 @@ static int stat_seq_init(struct tracer_stat_session *session) { struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; - void *prev_stat; + void *stat; int ret = 0; int i; @@ -85,6 +85,10 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; + stat = ts->stat_start(); + if (!stat) + goto exit; + /* * The first entry. Actually this is the second, but the first * one (the stat_list head) is pointless. @@ -99,14 +103,19 @@ static int stat_seq_init(struct tracer_stat_session *session) list_add(&new_entry->list, &session->stat_list); - new_entry->stat = ts->stat_start(); - prev_stat = new_entry->stat; + new_entry->stat = stat; /* * Iterate over the tracer stat entries and store them in a sorted * list. */ for (i = 1; ; i++) { + stat = ts->stat_next(stat, i); + + /* End of insertion */ + if (!stat) + break; + new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); if (!new_entry) { ret = -ENOMEM; @@ -114,11 +123,7 @@ static int stat_seq_init(struct tracer_stat_session *session) } INIT_LIST_HEAD(&new_entry->list); - new_entry->stat = ts->stat_next(prev_stat, i); - - /* End of insertion */ - if (!new_entry->stat) - break; + new_entry->stat = stat; list_for_each_entry(iter_entry, &session->stat_list, list) { @@ -137,8 +142,6 @@ static int stat_seq_init(struct tracer_stat_session *session) break; } } - - prev_stat = new_entry->stat; } exit: mutex_unlock(&session->stat_mutex); -- cgit v1.2.3-59-g8ed1b From 5d1a03dc541dc6672e60e57249ed22f40654ca47 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 23:38:49 -0400 Subject: function-graph: moved the timestamp from arch to generic code This patch move the timestamp from happening in the arch specific code into the general code. This allows for better control by the tracer to time manipulation. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 6 +----- include/linux/ftrace.h | 3 +-- kernel/trace/trace_functions_graph.c | 8 +++++--- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 57b33edb7ce3..61df77532120 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -410,7 +410,6 @@ int ftrace_disable_ftrace_graph_caller(void) void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) { unsigned long old; - unsigned long long calltime; int faulted; struct ftrace_graph_ent trace; unsigned long return_hooker = (unsigned long) @@ -453,10 +452,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - calltime = trace_clock_local(); - - if (ftrace_push_return_trace(old, calltime, - self_addr, &trace.depth) == -EBUSY) { + if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) { *parent = old; return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db3fed630db3..1141248c84ee 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -369,8 +369,7 @@ struct ftrace_ret_stack { extern void return_to_handler(void); extern int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth); +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); extern void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e876816fa8e7..d28687e7b3a7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -57,9 +57,9 @@ static struct tracer_flags tracer_flags = { /* Add a function return address to the trace stack on thread info.*/ int -ftrace_push_return_trace(unsigned long ret, unsigned long long time, - unsigned long func, int *depth) +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) { + unsigned long long calltime; int index; if (!current->ret_stack) @@ -71,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time, return -EBUSY; } + calltime = trace_clock_local(); + index = ++current->curr_ret_stack; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; - current->ret_stack[index].calltime = time; + current->ret_stack[index].calltime = calltime; *depth = index; return 0; -- cgit v1.2.3-59-g8ed1b From 05ce5818adee8f8efd0a5ca0d900a6789012516b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 00:18:31 -0400 Subject: function-graph: prevent more than one tracer registering Impact: prevent crash due to multiple function graph tracers The function graph tracer can currently only handle a single tracer being registered. If another tracer registers with the function graph tracer it can crash the system. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7847806eefef..c81a759fbf76 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2643,6 +2643,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_lock); + /* we currently allow only one tracer registered at a time */ + if (atomic_read(&ftrace_graph_active)) { + ret = -EBUSY; + goto out; + } + ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); -- cgit v1.2.3-59-g8ed1b From 8aef2d2856158a36c295a8d1288281e4839bff13 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 01:10:15 -0400 Subject: function-graph: ignore times across schedule Impact: more accurate timings The current method of function graph tracing does not take into account the time spent when a task is not running. This shows functions that call schedule have increased costs: 3) + 18.664 us | } ------------------------------------------ 3) -0 => kblockd-123 ------------------------------------------ 3) | finish_task_switch() { 3) 1.441 us | _spin_unlock_irq(); 3) 3.966 us | } 3) ! 2959.433 us | } 3) ! 2961.465 us | } This patch uses the tracepoint in the scheduling context switch to account for time that has elapsed while a task is scheduled out. Now we see: ------------------------------------------ 3) -0 => edac-po-1067 ------------------------------------------ 3) | finish_task_switch() { 3) 0.685 us | _spin_unlock_irq(); 3) 2.331 us | } 3) + 41.439 us | } 3) + 42.663 us | } Signed-off-by: Steven Rostedt --- include/linux/sched.h | 2 ++ kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 89cd308cc7a5..471e36d30123 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1409,6 +1409,8 @@ struct task_struct { int curr_ret_stack; /* Stack of return addresses for return function tracing */ struct ftrace_ret_stack *ret_stack; + /* time stamp for last schedule */ + unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c81a759fbf76..0b90364d1a2c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,6 +29,8 @@ #include #include +#include + #include #include "trace.h" @@ -2590,6 +2592,31 @@ free: return ret; } +static void +ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, + struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. + */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + /* Allocate a return stack for each task */ static int start_graph_tracing(void) { @@ -2611,6 +2638,13 @@ static int start_graph_tracing(void) ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + kfree(ret_stack_list); return ret; } @@ -2674,6 +2708,7 @@ void unregister_ftrace_graph(void) mutex_lock(&ftrace_lock); atomic_dec(&ftrace_graph_active); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); @@ -2694,6 +2729,7 @@ void ftrace_graph_init_task(struct task_struct *t) t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; } else t->ret_stack = NULL; } -- cgit v1.2.3-59-g8ed1b From 4b08e149c0e02e97ec49c2a31d14a0d3a02f8074 Mon Sep 17 00:00:00 2001 From: Benjamin Krill Date: Fri, 23 Jan 2009 17:18:05 +0100 Subject: [MTD] ofpart: Check name property to determine partition nodes. SLOF has a further node which could not be evaluated by the current routine. The current routine returns because the node hasn't the required reg property. As fix this patch adds a check to determine the partition child nodes. If the node is not a partition the number of total partitions will be decreased and loop continues with the next nodes. Signed-off-by: Benjamin Krill Signed-off-by: David Woodhouse --- drivers/mtd/ofpart.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/mtd/ofpart.c b/drivers/mtd/ofpart.c index 9e45b3f39c0e..3e164f0c9295 100644 --- a/drivers/mtd/ofpart.c +++ b/drivers/mtd/ofpart.c @@ -46,6 +46,13 @@ int __devinit of_mtd_parse_partitions(struct device *dev, const u32 *reg; int len; + /* check if this is a partition node */ + partname = of_get_property(pp, "name", &len); + if (strcmp(partname, "partition") != 0) { + nr_parts--; + continue; + } + reg = of_get_property(pp, "reg", &len); if (!reg || (len != 2 * sizeof(u32))) { of_node_put(pp); -- cgit v1.2.3-59-g8ed1b From be6f164a02f394675e2ac2077dd354cebef5b4c0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 11:06:24 -0400 Subject: function-graph: add option for include sleep times Impact: give user a choice to show times spent while sleeping The user may want to see the time a function spent sleeping. This patch adds the trace option "sleep-time" to allow that. The "sleep-time" option is default on. echo sleep-time > /debug/tracing/trace_options produces: ------------------------------------------ 2) avahi-d-3428 => -0 ------------------------------------------ 2) | finish_task_switch() { 2) 0.621 us | _spin_unlock_irq(); 2) 2.202 us | } 2) ! 1002.197 us | } 2) ! 1003.521 us | } where as, echo nosleep-time > /debug/tracing/trace_options produces: 0) -0 => yum-upd-3416 ------------------------------------------ 0) | finish_task_switch() { 0) 0.643 us | _spin_unlock_irq(); 0) 2.342 us | } 0) + 41.302 us | } 0) + 42.453 us | } Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 7 +++++++ kernel/trace/trace.c | 3 ++- kernel/trace/trace.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0b90364d1a2c..02d2de9d08ba 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2599,6 +2599,13 @@ ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, unsigned long long timestamp; int index; + /* + * Does the user want to count the time a function was asleep. + * If so, do not update the time stamps. + */ + if (trace_flags & TRACE_ITER_SLEEP_TIME) + return; + timestamp = trace_clock_local(); prev->ftrace_timestamp = timestamp; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f0e1337b1ebd..67c6a21dd427 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -255,7 +255,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -316,6 +316,7 @@ static const char *trace_options[] = { "context-info", "latency-format", "global-clock", + "sleep-time", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7cfb741be200..d7410bbb9a80 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -683,6 +683,7 @@ enum trace_iterator_flags { TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ TRACE_ITER_LATENCY_FMT = 0x40000, TRACE_ITER_GLOBAL_CLK = 0x80000, + TRACE_ITER_SLEEP_TIME = 0x100000, }; /* -- cgit v1.2.3-59-g8ed1b From cc59c9e8d0165c632fd056c4a23e36f917507fb4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2009 11:03:01 +0800 Subject: ftrace: show virtual PID Impact: fix PID output under namespaces When current namespace is not the global namespace, pid read from set_ftrace_pid is no correct. # ~/newpid_namespace_run bash # echo $$ 1 # echo 1 > set_ftrace_pid # cat set_ftrace_pid 3756 Since we write virtual PID to set_ftrace_pid, we need get virtual PID when we read it. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49C84D65.9050606@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02d2de9d08ba..bb377112b1bb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2264,7 +2264,7 @@ ftrace_pid_read(struct file *file, char __user *ubuf, if (ftrace_pid_trace == ftrace_swapper_pid) r = sprintf(buf, "swapper tasks\n"); else if (ftrace_pid_trace) - r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); + r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace)); else r = sprintf(buf, "no pid\n"); -- cgit v1.2.3-59-g8ed1b From ee000b7f9fe429d2470c674ccec8d344f6789e0d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 24 Mar 2009 13:38:06 +0800 Subject: tracing: use union for multi-usages field Impact: cleanup struct dyn_ftrace::ip has different usages in his lifecycle, we use union for it. And also for struct dyn_ftrace::flags. Signed-off-by: Lai Jiangshan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49C871BE.3080405@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 12 +++++++++--- kernel/trace/ftrace.c | 8 ++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1141248c84ee..015a3d22cf74 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -145,9 +145,15 @@ enum { }; struct dyn_ftrace { - unsigned long ip; /* address of mcount call-site */ - unsigned long flags; - struct dyn_arch_ftrace arch; + union { + unsigned long ip; /* address of mcount call-site */ + struct dyn_ftrace *freelist; + }; + union { + unsigned long flags; + struct dyn_ftrace *newlist; + }; + struct dyn_arch_ftrace arch; }; int ftrace_force_update(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index bb377112b1bb..7b8722baf153 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -341,7 +341,7 @@ static inline int record_frozen(struct dyn_ftrace *rec) static void ftrace_free_rec(struct dyn_ftrace *rec) { - rec->ip = (unsigned long)ftrace_free_records; + rec->freelist = ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } @@ -379,7 +379,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) return NULL; } - ftrace_free_records = (void *)rec->ip; + ftrace_free_records = rec->freelist; memset(rec, 0, sizeof(*rec)); return rec; } @@ -411,7 +411,7 @@ ftrace_record_ip(unsigned long ip) return NULL; rec->ip = ip; - rec->flags = (unsigned long)ftrace_new_addrs; + rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; return rec; @@ -731,7 +731,7 @@ static int ftrace_update_code(struct module *mod) return -1; p = ftrace_new_addrs; - ftrace_new_addrs = (struct dyn_ftrace *)p->flags; + ftrace_new_addrs = p->newlist; p->flags = 0L; /* convert record (i.e, patch mcount-call with NOP) */ -- cgit v1.2.3-59-g8ed1b From 3a38148f0488069cadb75c4a6909954072d648bf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Mar 2009 20:27:39 +0100 Subject: genirq: provide old request_irq() for CONFIG_GENERIC_HARDIRQ=n Impact: Undo compile breakage for archs with CONFIG_GENERIC_HARDIRQ=n The threaded interrupt handler patches changed request_irq from extern to inline. Architectures which do not use the generic irq code still have request_irq() as a global function and therefor fail to compile. Keep the extern declaration for CONFIG_GENERIC_HARDIRQ=n Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 266a5f5f57cc..7e63b824833f 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -103,6 +103,7 @@ struct irqaction { extern irqreturn_t no_action(int cpl, void *dev_id); +#ifdef CONFIG_GENERIC_HARDIRQS extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, @@ -115,9 +116,13 @@ request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, return request_threaded_irq(irq, handler, NULL, flags, name, dev); } -#ifdef CONFIG_GENERIC_HARDIRQS extern void exit_irq_thread(void); #else + +extern int __must_check +request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, + const char *name, void *dev); + static inline void exit_irq_thread(void) { } #endif -- cgit v1.2.3-59-g8ed1b From 89c8aa203e71f05afdc978700cbc767d310e3d70 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 2 Feb 2009 21:08:30 +0100 Subject: mmc: struct device - replace bus_id with dev_name(), dev_set_name() Acked-by: Greg Kroah-Hartman Signed-off-by: Kay Sievers Signed-off-by: Pierre Ossman --- drivers/mmc/host/atmel-mci.c | 2 +- drivers/mmc/host/of_mmc_spi.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 2b1196e6142c..e94e92001e7c 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -1603,7 +1603,7 @@ static int __init atmci_probe(struct platform_device *pdev) tasklet_init(&host->tasklet, atmci_tasklet_func, (unsigned long)host); - ret = request_irq(irq, atmci_interrupt, 0, pdev->dev.bus_id, host); + ret = request_irq(irq, atmci_interrupt, 0, dev_name(&pdev->dev), host); if (ret) goto err_request_irq; diff --git a/drivers/mmc/host/of_mmc_spi.c b/drivers/mmc/host/of_mmc_spi.c index fb2921f8099d..0c44d560bf1a 100644 --- a/drivers/mmc/host/of_mmc_spi.c +++ b/drivers/mmc/host/of_mmc_spi.c @@ -103,7 +103,7 @@ struct mmc_spi_platform_data *mmc_spi_get_pdata(struct spi_device *spi) if (!gpio_is_valid(oms->gpios[i])) continue; - ret = gpio_request(oms->gpios[i], dev->bus_id); + ret = gpio_request(oms->gpios[i], dev_name(dev)); if (ret < 0) { oms->gpios[i] = -EINVAL; continue; -- cgit v1.2.3-59-g8ed1b From 7de427d088a967d2173739e21e744921d5496a8b Mon Sep 17 00:00:00 2001 From: Jorg Schummer Date: Thu, 19 Feb 2009 13:17:03 +0200 Subject: mmc: delayed_work was never cancelled The delayed work item mmc_host.detect is now cancelled before flushing the work queue. This takes care of cases when delayed_work was scheduled for mmc_host.detect, but not yet placed in the work queue. Signed-off-by: Jorg Schummer Signed-off-by: Pierre Ossman --- drivers/mmc/core/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index df6ce4a06cf3..b5899e33b687 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -815,6 +815,7 @@ void mmc_stop_host(struct mmc_host *host) spin_unlock_irqrestore(&host->lock, flags); #endif + cancel_delayed_work(&host->detect); mmc_flush_scheduled_work(); mmc_bus_get(host); @@ -842,6 +843,7 @@ void mmc_stop_host(struct mmc_host *host) */ int mmc_suspend_host(struct mmc_host *host, pm_message_t state) { + cancel_delayed_work(&host->detect); mmc_flush_scheduled_work(); mmc_bus_get(host); -- cgit v1.2.3-59-g8ed1b From 6b0b62853b2553be375033776902640320970846 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Mon, 23 Feb 2009 12:38:41 +0000 Subject: mmc: add MODALIAS linkage for MMC/SD devices Currently we are using an explicit udev rule to trigger loading of the mmc-block module when an MMC or SD card is detected: SUBSYSTEM=="mmc", RUN+="/sbin/modprobe -Qba mmc-block" It makes much more sense for the mmc bus driver and the mmc-block module to share MODALIAS information so that they are linked automatically. There is no real information of use in the MMC system at the current time. All devices inserted require us to load the mmc-block device. Until such time as useful parameters exist simply reflect the module linkage via the module alias below: mmc:block Signed-off-by: Andy Whitcroft Signed-off-by: Pierre Ossman --- drivers/mmc/card/block.c | 2 ++ drivers/mmc/core/bus.c | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 513eb09a638f..fe8041e619ea 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -41,6 +41,8 @@ #include "queue.h" +MODULE_ALIAS("mmc:block"); + /* * max 8 partitions per card */ diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index f210a8ee6861..bdb165f93046 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -84,6 +84,14 @@ mmc_bus_uevent(struct device *dev, struct kobj_uevent_env *env) } retval = add_uevent_var(env, "MMC_NAME=%s", mmc_card_name(card)); + if (retval) + return retval; + + /* + * Request the mmc_block device. Note: that this is a direct request + * for the module it carries no information as to what is inserted. + */ + retval = add_uevent_var(env, "MODALIAS=mmc:block"); return retval; } -- cgit v1.2.3-59-g8ed1b From 736bb6bb01a2a180b6f062e792bd03658d57ab7e Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 11 Feb 2009 14:52:20 +0200 Subject: mmc: Add Extended CSD register to debugfs Extended CSD is a MMC card register. As increasingly interesting fields are being added to Extended CSD, it is helpful to see its value. Note that SD cards do not have an Extended CSD register, so it is MMC only. Signed-off-by: Adrian Hunter Signed-off-by: Pierre Ossman --- drivers/mmc/core/debugfs.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c index 1237bb4c722b..610dbd1fcc82 100644 --- a/drivers/mmc/core/debugfs.c +++ b/drivers/mmc/core/debugfs.c @@ -184,6 +184,68 @@ static int mmc_dbg_card_status_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(mmc_dbg_card_status_fops, mmc_dbg_card_status_get, NULL, "%08llx\n"); +#define EXT_CSD_STR_LEN 1025 + +static int mmc_ext_csd_open(struct inode *inode, struct file *filp) +{ + struct mmc_card *card = inode->i_private; + char *buf; + ssize_t n = 0; + u8 *ext_csd; + int err, i; + + buf = kmalloc(EXT_CSD_STR_LEN + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + ext_csd = kmalloc(512, GFP_KERNEL); + if (!ext_csd) { + err = -ENOMEM; + goto out_free; + } + + mmc_claim_host(card->host); + err = mmc_send_ext_csd(card, ext_csd); + mmc_release_host(card->host); + if (err) + goto out_free; + + for (i = 511; i >= 0; i--) + n += sprintf(buf + n, "%02x", ext_csd[i]); + n += sprintf(buf + n, "\n"); + BUG_ON(n != EXT_CSD_STR_LEN); + + filp->private_data = buf; + kfree(ext_csd); + return 0; + +out_free: + kfree(buf); + kfree(ext_csd); + return err; +} + +static ssize_t mmc_ext_csd_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf = filp->private_data; + + return simple_read_from_buffer(ubuf, cnt, ppos, + buf, EXT_CSD_STR_LEN); +} + +static int mmc_ext_csd_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static struct file_operations mmc_dbg_ext_csd_fops = { + .open = mmc_ext_csd_open, + .read = mmc_ext_csd_read, + .release = mmc_ext_csd_release, +}; + void mmc_add_card_debugfs(struct mmc_card *card) { struct mmc_host *host = card->host; @@ -211,6 +273,11 @@ void mmc_add_card_debugfs(struct mmc_card *card) &mmc_dbg_card_status_fops)) goto err; + if (mmc_card_mmc(card)) + if (!debugfs_create_file("ext_csd", S_IRUSR, root, card, + &mmc_dbg_ext_csd_fops)) + goto err; + return; err: -- cgit v1.2.3-59-g8ed1b From 9e57d60829f5c00ebf44df65a4b709359e285c20 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 24 Feb 2009 14:48:16 +0200 Subject: omap_hsmmc: do not re-power when powering off MMC Remove code that turns MMC1 power back on after it has been powered off (when the voltage is 1.8V). The offending code is not necessary because the host controller bus voltage is initialized to 3V when probing or resuming. Note that MMC powers up with the highest voltage available (see mmc_power_up()) which will be 3V also. Signed-off-by: Adrian Hunter Acked-by: Tony Lindgren Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index a631c81dce12..303a7970806d 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -485,9 +485,6 @@ static int omap_mmc_switch_opcond(struct mmc_omap_host *host, int vdd) u32 reg_val = 0; int ret; - if (host->id != OMAP_MMC1_DEVID) - return 0; - /* Disable the clocks */ clk_disable(host->fclk); clk_disable(host->iclk); @@ -786,20 +783,6 @@ static void omap_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) switch (ios->power_mode) { case MMC_POWER_OFF: mmc_slot(host).set_power(host->dev, host->slot_id, 0, 0); - /* - * Reset interface voltage to 3V if it's 1.8V now; - * only relevant on MMC-1, the others always use 1.8V. - * - * REVISIT: If we are able to detect cards after unplugging - * a 1.8V card, this code should not be needed. - */ - if (host->id != OMAP_MMC1_DEVID) - break; - if (!(OMAP_HSMMC_READ(host->base, HCTL) & SDVSDET)) { - int vdd = fls(host->mmc->ocr_avail) - 1; - if (omap_mmc_switch_opcond(host, vdd) != 0) - host->mmc->ios.vdd = vdd; - } break; case MMC_POWER_UP: mmc_slot(host).set_power(host->dev, host->slot_id, 1, ios->vdd); -- cgit v1.2.3-59-g8ed1b From 1b331e69a2313f6e857890c7c2c40e3e2a74367a Mon Sep 17 00:00:00 2001 From: Kim Kyuwon Date: Fri, 20 Feb 2009 13:10:08 +0100 Subject: omap_hsmmc: Initialize hsmmc controller registers when resuming Most registers lose its state when the processor wakes up from sleep state. Thus registers should be initialized, when the processor wakes up. However the current hsmmc 'resume' function doesn't consider this issue and finally makes deadlock. So this patch fixes this problem. Signed-off-by: Kim Kyuwon Signed-off-by: Adrian Hunter Acked-by: Tony Lindgren Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 55 +++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 303a7970806d..576bfa7216b3 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -56,6 +56,7 @@ #define SDVS18 (0x5 << 9) #define SDVS30 (0x6 << 9) #define SDVS33 (0x7 << 9) +#define SDVS_MASK 0x00000E00 #define SDVSCLR 0xFFFFF1FF #define SDVSDET 0x00000400 #define AUTOIDLE 0x1 @@ -874,6 +875,34 @@ static int omap_hsmmc_get_ro(struct mmc_host *mmc) return pdata->slots[0].get_ro(host->dev, 0); } +static void omap_hsmmc_init(struct mmc_omap_host *host) +{ + u32 hctl, capa, value; + + /* Only MMC1 supports 3.0V */ + if (host->id == OMAP_MMC1_DEVID) { + hctl = SDVS30; + capa = VS30 | VS18; + } else { + hctl = SDVS18; + capa = VS18; + } + + value = OMAP_HSMMC_READ(host->base, HCTL) & ~SDVS_MASK; + OMAP_HSMMC_WRITE(host->base, HCTL, value | hctl); + + value = OMAP_HSMMC_READ(host->base, CAPA); + OMAP_HSMMC_WRITE(host->base, CAPA, value | capa); + + /* Set the controller to AUTO IDLE mode */ + value = OMAP_HSMMC_READ(host->base, SYSCONFIG); + OMAP_HSMMC_WRITE(host->base, SYSCONFIG, value | AUTOIDLE); + + /* Set SD bus power bit */ + value = OMAP_HSMMC_READ(host->base, HCTL); + OMAP_HSMMC_WRITE(host->base, HCTL, value | SDBP); +} + static struct mmc_host_ops mmc_omap_ops = { .request = omap_mmc_request, .set_ios = omap_mmc_set_ios, @@ -889,7 +918,6 @@ static int __init omap_mmc_probe(struct platform_device *pdev) struct mmc_omap_host *host = NULL; struct resource *res; int ret = 0, irq; - u32 hctl, capa; if (pdata == NULL) { dev_err(&pdev->dev, "Platform Data is missing\n"); @@ -994,28 +1022,7 @@ static int __init omap_mmc_probe(struct platform_device *pdev) if (pdata->slots[host->slot_id].wires >= 4) mmc->caps |= MMC_CAP_4_BIT_DATA; - /* Only MMC1 supports 3.0V */ - if (host->id == OMAP_MMC1_DEVID) { - hctl = SDVS30; - capa = VS30 | VS18; - } else { - hctl = SDVS18; - capa = VS18; - } - - OMAP_HSMMC_WRITE(host->base, HCTL, - OMAP_HSMMC_READ(host->base, HCTL) | hctl); - - OMAP_HSMMC_WRITE(host->base, CAPA, - OMAP_HSMMC_READ(host->base, CAPA) | capa); - - /* Set the controller to AUTO IDLE mode */ - OMAP_HSMMC_WRITE(host->base, SYSCONFIG, - OMAP_HSMMC_READ(host->base, SYSCONFIG) | AUTOIDLE); - - /* Set SD bus power bit */ - OMAP_HSMMC_WRITE(host->base, HCTL, - OMAP_HSMMC_READ(host->base, HCTL) | SDBP); + omap_hsmmc_init(host); /* Request IRQ for MMC operations */ ret = request_irq(host->irq, mmc_omap_irq, IRQF_DISABLED, @@ -1205,6 +1212,8 @@ static int omap_mmc_resume(struct platform_device *pdev) dev_dbg(mmc_dev(host->mmc), "Enabling debounce clk failed\n"); + omap_hsmmc_init(host); + if (host->pdata->resume) { ret = host->pdata->resume(&pdev->dev, host->slot_id); if (ret) -- cgit v1.2.3-59-g8ed1b From 0d6132ba0b006dd2bea9ba0c7b6b2f690cd95c40 Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Thu, 5 Mar 2009 19:37:28 +0100 Subject: sdio: handle cis end marker in link field Signed-off-by: Pierre Ossman --- drivers/mmc/core/sdio_cis.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/mmc/core/sdio_cis.c b/drivers/mmc/core/sdio_cis.c index 956bd7677502..6ba93f599281 100644 --- a/drivers/mmc/core/sdio_cis.c +++ b/drivers/mmc/core/sdio_cis.c @@ -227,6 +227,10 @@ static int sdio_read_cis(struct mmc_card *card, struct sdio_func *func) if (ret) break; + /* a size of 0xff also means we're done */ + if (tpl_link == 0xff) + break; + this = kmalloc(sizeof(*this) + tpl_link, GFP_KERNEL); if (!this) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b From c8d718f1037950107f13607ff0b696ffe63df76a Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Thu, 5 Mar 2009 19:38:38 +0100 Subject: sdio: handle null tuples Signed-off-by: Pierre Ossman --- drivers/mmc/core/sdio_cis.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/mmc/core/sdio_cis.c b/drivers/mmc/core/sdio_cis.c index 6ba93f599281..963f2937c5e3 100644 --- a/drivers/mmc/core/sdio_cis.c +++ b/drivers/mmc/core/sdio_cis.c @@ -223,6 +223,10 @@ static int sdio_read_cis(struct mmc_card *card, struct sdio_func *func) if (tpl_code == 0xff) break; + /* null entries have no link field or data */ + if (tpl_code == 0x00) + continue; + ret = mmc_io_rw_direct(card, 0, 0, ptr++, 0, &tpl_link); if (ret) break; -- cgit v1.2.3-59-g8ed1b From be6f19fc24c937112d251232b3dae7e05e96aad1 Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Thu, 5 Mar 2009 19:40:27 +0100 Subject: sdio: check that addresses are within the address space Signed-off-by: Pierre Ossman --- drivers/mmc/core/sdio_ops.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/mmc/core/sdio_ops.c b/drivers/mmc/core/sdio_ops.c index c8fa095a4488..4eb7825fd1a7 100644 --- a/drivers/mmc/core/sdio_ops.c +++ b/drivers/mmc/core/sdio_ops.c @@ -76,6 +76,10 @@ int mmc_io_rw_direct(struct mmc_card *card, int write, unsigned fn, BUG_ON(!card); BUG_ON(fn > 7); + /* sanity check */ + if (addr & ~0x1FFFF) + return -EINVAL; + memset(&cmd, 0, sizeof(struct mmc_command)); cmd.opcode = SD_IO_RW_DIRECT; @@ -125,6 +129,10 @@ int mmc_io_rw_extended(struct mmc_card *card, int write, unsigned fn, WARN_ON(blocks == 0); WARN_ON(blksz == 0); + /* sanity check */ + if (addr & ~0x1FFFF) + return -EINVAL; + memset(&mrq, 0, sizeof(struct mmc_request)); memset(&cmd, 0, sizeof(struct mmc_command)); memset(&data, 0, sizeof(struct mmc_data)); -- cgit v1.2.3-59-g8ed1b From d3096f88ac4596a509fc5fbe6fa7e7a5497eb399 Mon Sep 17 00:00:00 2001 From: Balaji Rao Date: Wed, 25 Feb 2009 15:28:09 +0530 Subject: mmc: During unsafe resume, select the right volatge for the card During mmc unsafe resume, choose the right voltage for the card after powerup. Although this has not seen to cause trouble, it's the wrong behaviour. Signed-off-by: Balaji Rao Signed-off-by: Pierre Ossman --- drivers/mmc/core/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index b5899e33b687..4a18d90556d9 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -877,6 +877,7 @@ int mmc_resume_host(struct mmc_host *host) mmc_bus_get(host); if (host->bus_ops && !host->bus_dead) { mmc_power_up(host); + mmc_select_voltage(host, host->ocr); BUG_ON(!host->bus_ops->resume); host->bus_ops->resume(host); } -- cgit v1.2.3-59-g8ed1b From b8e2006d5b17e58a18b38fc5a38769aad5f78d98 Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Sat, 14 Mar 2009 21:17:32 +0100 Subject: sdhci: change list address Domain change of the sdhci development list. Signed-off-by: Pierre Ossman --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5d460c9d1c2c..e1fe959b9f83 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3834,7 +3834,7 @@ S: Maintained SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) DRIVER P: Pierre Ossman M: drzeus-sdhci@drzeus.cx -L: sdhci-devel@list.drzeus.cx +L: sdhci-devel@lists.ossman.eu S: Maintained SECURITY SUBSYSTEM -- cgit v1.2.3-59-g8ed1b From 48881caec426786cd451383ee53943cc5d3bfdeb Mon Sep 17 00:00:00 2001 From: Wolfgang Muees Date: Wed, 11 Mar 2009 14:13:15 +0100 Subject: mmc_spi: allow setting of spi mode 3 Allow the platform data structures to specify spi mode 3 (if there is a pullup on the clock line or the spi hardware is not able to serve spi mode 0). Signed-off-by: Wolfgang Muees Acked-by: David Brownell Signed-off-by: Pierre Ossman --- drivers/mmc/host/mmc_spi.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 87e211df68ac..ad9e0e213f64 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -1204,10 +1204,12 @@ static int mmc_spi_probe(struct spi_device *spi) /* MMC and SD specs only seem to care that sampling is on the * rising edge ... meaning SPI modes 0 or 3. So either SPI mode - * should be legit. We'll use mode 0 since it seems to be a - * bit less troublesome on some hardware ... unclear why. + * should be legit. We'll use mode 0 since the steady state is 0, + * which is appropriate for hotplugging, unless the platform data + * specify mode 3 (if hardware is not compatible to mode 0). */ - spi->mode = SPI_MODE_0; + if (spi->mode != SPI_MODE_3) + spi->mode = SPI_MODE_0; spi->bits_per_word = 8; status = spi_setup(spi); -- cgit v1.2.3-59-g8ed1b From ea15ba5cd7bb370902cd9f6a73c2d288bfba6b2c Mon Sep 17 00:00:00 2001 From: Wolfgang Muees Date: Wed, 11 Mar 2009 14:17:43 +0100 Subject: mmc_spi: wait more bytes for card response Some cards are slower than the standard allows and need more time to respond to a command. Max. observed number of bytes was 12. Signed-off-by: Wolfgang Muees Acked-by: David Brownell Signed-off-by: Pierre Ossman --- drivers/mmc/host/mmc_spi.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index ad9e0e213f64..69e1442ff2e2 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -279,8 +279,11 @@ static int mmc_spi_response_get(struct mmc_spi_host *host, * so it can always DMA directly into the target buffer. * It'd probably be better to memcpy() the first chunk and * avoid extra i/o calls... + * + * Note we check for more than 8 bytes, because in practice, + * some SD cards are slow... */ - for (i = 2; i < 9; i++) { + for (i = 2; i < 16; i++) { value = mmc_spi_readbytes(host, 1); if (value < 0) goto done; -- cgit v1.2.3-59-g8ed1b From c0c88871574ccb4ee53dde1bbb678931b38ed47b Mon Sep 17 00:00:00 2001 From: Wolfgang Muees Date: Wed, 11 Mar 2009 14:28:39 +0100 Subject: mmc_spi: allow higher timeouts for SPI mode Some SD cards have very high timeouts in SPI mode. So adjust the timeouts from theory to practice. Signed-off-by: Wolfgang Muees Acked-by: David Brownell Signed-off-by: Pierre Ossman --- drivers/mmc/core/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 4a18d90556d9..5c83c67b186c 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -297,6 +297,21 @@ void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card) data->timeout_clks = 0; } } + /* + * Some cards need very high timeouts if driven in SPI mode. + * The worst observed timeout was 900ms after writing a + * continuous stream of data until the internal logic + * overflowed. + */ + if (mmc_host_is_spi(card->host)) { + if (data->flags & MMC_DATA_WRITE) { + if (data->timeout_ns < 1000000000) + data->timeout_ns = 1000000000; /* 1s */ + } else { + if (data->timeout_ns < 100000000) + data->timeout_ns = 100000000; /* 100ms */ + } + } } EXPORT_SYMBOL(mmc_set_data_timeout); -- cgit v1.2.3-59-g8ed1b From bc6772a023ceab8df404b18b31c27f764dcf5b3f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 11 Mar 2009 21:58:54 +0900 Subject: tmio_mmc: Fix one off, use resource_size() in probe() Update the tmio_mmc code to use resource_size(). With this patch applied the correct resource size is passed to ioremap(). Signed-off-by: Magnus Damm Acked-by: Ian Molton Signed-off-by: Pierre Ossman --- drivers/mmc/host/tmio_mmc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index 6a7a61904833..4f3e265d0203 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -568,11 +568,11 @@ static int __devinit tmio_mmc_probe(struct platform_device *dev) host->mmc = mmc; platform_set_drvdata(dev, mmc); - host->ctl = ioremap(res_ctl->start, res_ctl->end - res_ctl->start); + host->ctl = ioremap(res_ctl->start, resource_size(res_ctl)); if (!host->ctl) goto host_free; - host->cnf = ioremap(res_cnf->start, res_cnf->end - res_cnf->start); + host->cnf = ioremap(res_cnf->start, resource_size(res_cnf)); if (!host->cnf) goto unmap_ctl; -- cgit v1.2.3-59-g8ed1b From bedcc45c2e5d72b1c4b087b725c391441a93eee6 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 11 Mar 2009 21:59:03 +0900 Subject: tmio_mmc: Fix use after free in remove() Update the tmio_mmc code to call mmc_free_host() when done using the private data. Without this fix the driver frees memory and then keeps on using it as private data. Signed-off-by: Magnus Damm Acked-by: Ian Molton Signed-off-by: Pierre Ossman --- drivers/mmc/host/tmio_mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index 4f3e265d0203..63fbd5b7d312 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -650,10 +650,10 @@ static int __devexit tmio_mmc_remove(struct platform_device *dev) if (mmc) { struct tmio_mmc_host *host = mmc_priv(mmc); mmc_remove_host(mmc); - mmc_free_host(mmc); free_irq(host->irq, host); iounmap(host->ctl); iounmap(host->cnf); + mmc_free_host(mmc); } return 0; -- cgit v1.2.3-59-g8ed1b From 4cb32906e47987fee3fbfd79145d2348a2e8c5f3 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Sat, 14 Mar 2009 12:37:47 +0300 Subject: MMC: tmio_mmc.h: fix build problem drivers/mmc/host/tmio_mmc.h: In function 'tmio_mmc_kmap_atomic': drivers/mmc/host/tmio_mmc.h:147: error: implicit declaration of function 'kmap_atomic' drivers/mmc/host/tmio_mmc.h:147: error: 'KM_BIO_SRC_IRQ' undeclared (first use in this function) drivers/mmc/host/tmio_mmc.h: In function 'tmio_mmc_kunmap_atomic': drivers/mmc/host/tmio_mmc.h:153: error: implicit declaration of function 'kunmap_atomic' drivers/mmc/host/tmio_mmc.h:153: error: 'KM_BIO_SRC_IRQ' undeclared (first use in this function) Signed-off-by: Alexander Beregalov Signed-off-by: Pierre Ossman --- drivers/mmc/host/tmio_mmc.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index ba2b4240a86a..9c831ab2ece6 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -8,6 +8,9 @@ * published by the Free Software Foundation. * */ + +#include + #define CNF_CMD 0x04 #define CNF_CTL_BASE 0x10 #define CNF_INT_PIN 0x3d -- cgit v1.2.3-59-g8ed1b From 236caa7cc351f885874a2776b7dd1b5667359dc8 Mon Sep 17 00:00:00 2001 From: Maen Suleiman Date: Sat, 14 Feb 2009 03:07:26 -0500 Subject: mmc: SDIO driver for Marvell SoCs This supports MMC/SD/SDIO currently found on the Kirkwood 88F6281 and 88F6192 SoC controllers. Signed-off-by: Nicolas Pitre Signed-off-by: Pierre Ossman --- arch/arm/plat-orion/include/plat/mvsdio.h | 21 + drivers/mmc/host/Kconfig | 11 + drivers/mmc/host/Makefile | 1 + drivers/mmc/host/mvsdio.c | 885 ++++++++++++++++++++++++++++++ drivers/mmc/host/mvsdio.h | 190 +++++++ 5 files changed, 1108 insertions(+) create mode 100644 arch/arm/plat-orion/include/plat/mvsdio.h create mode 100644 drivers/mmc/host/mvsdio.c create mode 100644 drivers/mmc/host/mvsdio.h diff --git a/arch/arm/plat-orion/include/plat/mvsdio.h b/arch/arm/plat-orion/include/plat/mvsdio.h new file mode 100644 index 000000000000..14ca88676002 --- /dev/null +++ b/arch/arm/plat-orion/include/plat/mvsdio.h @@ -0,0 +1,21 @@ +/* + * arch/arm/plat-orion/include/plat/mvsdio.h + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#ifndef __MACH_MVSDIO_H +#define __MACH_MVSDIO_H + +#include + +struct mvsdio_platform_data { + struct mbus_dram_target_info *dram; + unsigned int clock; + int gpio_card_detect; + int gpio_write_protect; +}; + +#endif diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 99d4b28d52ed..092baf66733c 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -171,6 +171,17 @@ config MMC_TIFM_SD To compile this driver as a module, choose M here: the module will be called tifm_sd. +config MMC_MVSDIO + tristate "Marvell MMC/SD/SDIO host driver" + depends on PLAT_ORION + ---help--- + This selects the Marvell SDIO host driver. + SDIO may currently be found on the Kirkwood 88F6281 and 88F6192 + SoC controllers. + + To compile this driver as a module, choose M here: the + module will be called mvsdio. + config MMC_SPI tristate "MMC/SD/SDIO over SPI" depends on SPI_MASTER && !HIGHMEM && HAS_DMA diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index dedec55861d9..c006eb47e592 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_MMC_OMAP_HS) += omap_hsmmc.o obj-$(CONFIG_MMC_AT91) += at91_mci.o obj-$(CONFIG_MMC_ATMELMCI) += atmel-mci.o obj-$(CONFIG_MMC_TIFM_SD) += tifm_sd.o +obj-$(CONFIG_MMC_MVSDIO) += mvsdio.o obj-$(CONFIG_MMC_SPI) += mmc_spi.o ifeq ($(CONFIG_OF),y) obj-$(CONFIG_MMC_SPI) += of_mmc_spi.o diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c new file mode 100644 index 000000000000..b5c375d94ab3 --- /dev/null +++ b/drivers/mmc/host/mvsdio.c @@ -0,0 +1,885 @@ +/* + * Marvell MMC/SD/SDIO driver + * + * Authors: Maen Suleiman, Nicolas Pitre + * Copyright (C) 2008-2009 Marvell Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "mvsdio.h" + +#define DRIVER_NAME "mvsdio" + +static int maxfreq = MVSD_CLOCKRATE_MAX; +static int nodma; + +struct mvsd_host { + void __iomem *base; + struct mmc_request *mrq; + spinlock_t lock; + unsigned int xfer_mode; + unsigned int intr_en; + unsigned int ctrl; + unsigned int pio_size; + void *pio_ptr; + unsigned int sg_frags; + unsigned int ns_per_clk; + unsigned int clock; + unsigned int base_clock; + struct timer_list timer; + struct mmc_host *mmc; + struct device *dev; + struct resource *res; + int irq; + int gpio_card_detect; + int gpio_write_protect; +}; + +#define mvsd_write(offs, val) writel(val, iobase + (offs)) +#define mvsd_read(offs) readl(iobase + (offs)) + +static int mvsd_setup_data(struct mvsd_host *host, struct mmc_data *data) +{ + void __iomem *iobase = host->base; + unsigned int tmout; + int tmout_index; + + /* If timeout=0 then maximum timeout index is used. */ + tmout = DIV_ROUND_UP(data->timeout_ns, host->ns_per_clk); + tmout += data->timeout_clks; + tmout_index = fls(tmout - 1) - 12; + if (tmout_index < 0) + tmout_index = 0; + if (tmout_index > MVSD_HOST_CTRL_TMOUT_MAX) + tmout_index = MVSD_HOST_CTRL_TMOUT_MAX; + + dev_dbg(host->dev, "data %s at 0x%08x: blocks=%d blksz=%d tmout=%u (%d)\n", + (data->flags & MMC_DATA_READ) ? "read" : "write", + (u32)sg_virt(data->sg), data->blocks, data->blksz, + tmout, tmout_index); + + host->ctrl &= ~MVSD_HOST_CTRL_TMOUT_MASK; + host->ctrl |= MVSD_HOST_CTRL_TMOUT(tmout_index); + mvsd_write(MVSD_HOST_CTRL, host->ctrl); + mvsd_write(MVSD_BLK_COUNT, data->blocks); + mvsd_write(MVSD_BLK_SIZE, data->blksz); + + if (nodma || (data->blksz | data->sg->offset) & 3) { + /* + * We cannot do DMA on a buffer which offset or size + * is not aligned on a 4-byte boundary. + */ + host->pio_size = data->blocks * data->blksz; + host->pio_ptr = sg_virt(data->sg); + if (!nodma) + printk(KERN_DEBUG "%s: fallback to PIO for data " + "at 0x%p size %d\n", + mmc_hostname(host->mmc), + host->pio_ptr, host->pio_size); + return 1; + } else { + dma_addr_t phys_addr; + int dma_dir = (data->flags & MMC_DATA_READ) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE; + host->sg_frags = dma_map_sg(mmc_dev(host->mmc), data->sg, + data->sg_len, dma_dir); + phys_addr = sg_dma_address(data->sg); + mvsd_write(MVSD_SYS_ADDR_LOW, (u32)phys_addr & 0xffff); + mvsd_write(MVSD_SYS_ADDR_HI, (u32)phys_addr >> 16); + return 0; + } +} + +static void mvsd_request(struct mmc_host *mmc, struct mmc_request *mrq) +{ + struct mvsd_host *host = mmc_priv(mmc); + void __iomem *iobase = host->base; + struct mmc_command *cmd = mrq->cmd; + u32 cmdreg = 0, xfer = 0, intr = 0; + unsigned long flags; + + BUG_ON(host->mrq != NULL); + host->mrq = mrq; + + dev_dbg(host->dev, "cmd %d (hw state 0x%04x)\n", + cmd->opcode, mvsd_read(MVSD_HW_STATE)); + + cmdreg = MVSD_CMD_INDEX(cmd->opcode); + + if (cmd->flags & MMC_RSP_BUSY) + cmdreg |= MVSD_CMD_RSP_48BUSY; + else if (cmd->flags & MMC_RSP_136) + cmdreg |= MVSD_CMD_RSP_136; + else if (cmd->flags & MMC_RSP_PRESENT) + cmdreg |= MVSD_CMD_RSP_48; + else + cmdreg |= MVSD_CMD_RSP_NONE; + + if (cmd->flags & MMC_RSP_CRC) + cmdreg |= MVSD_CMD_CHECK_CMDCRC; + + if (cmd->flags & MMC_RSP_OPCODE) + cmdreg |= MVSD_CMD_INDX_CHECK; + + if (cmd->flags & MMC_RSP_PRESENT) { + cmdreg |= MVSD_UNEXPECTED_RESP; + intr |= MVSD_NOR_UNEXP_RSP; + } + + if (mrq->data) { + struct mmc_data *data = mrq->data; + int pio; + + cmdreg |= MVSD_CMD_DATA_PRESENT | MVSD_CMD_CHECK_DATACRC16; + xfer |= MVSD_XFER_MODE_HW_WR_DATA_EN; + if (data->flags & MMC_DATA_READ) + xfer |= MVSD_XFER_MODE_TO_HOST; + + pio = mvsd_setup_data(host, data); + if (pio) { + xfer |= MVSD_XFER_MODE_PIO; + /* PIO section of mvsd_irq has comments on those bits */ + if (data->flags & MMC_DATA_WRITE) + intr |= MVSD_NOR_TX_AVAIL; + else if (host->pio_size > 32) + intr |= MVSD_NOR_RX_FIFO_8W; + else + intr |= MVSD_NOR_RX_READY; + } + + if (data->stop) { + struct mmc_command *stop = data->stop; + u32 cmd12reg = 0; + + mvsd_write(MVSD_AUTOCMD12_ARG_LOW, stop->arg & 0xffff); + mvsd_write(MVSD_AUTOCMD12_ARG_HI, stop->arg >> 16); + + if (stop->flags & MMC_RSP_BUSY) + cmd12reg |= MVSD_AUTOCMD12_BUSY; + if (stop->flags & MMC_RSP_OPCODE) + cmd12reg |= MVSD_AUTOCMD12_INDX_CHECK; + cmd12reg |= MVSD_AUTOCMD12_INDEX(stop->opcode); + mvsd_write(MVSD_AUTOCMD12_CMD, cmd12reg); + + xfer |= MVSD_XFER_MODE_AUTO_CMD12; + intr |= MVSD_NOR_AUTOCMD12_DONE; + } else { + intr |= MVSD_NOR_XFER_DONE; + } + } else { + intr |= MVSD_NOR_CMD_DONE; + } + + mvsd_write(MVSD_ARG_LOW, cmd->arg & 0xffff); + mvsd_write(MVSD_ARG_HI, cmd->arg >> 16); + + spin_lock_irqsave(&host->lock, flags); + + host->xfer_mode &= MVSD_XFER_MODE_INT_CHK_EN; + host->xfer_mode |= xfer; + mvsd_write(MVSD_XFER_MODE, host->xfer_mode); + + mvsd_write(MVSD_NOR_INTR_STATUS, ~MVSD_NOR_CARD_INT); + mvsd_write(MVSD_ERR_INTR_STATUS, 0xffff); + mvsd_write(MVSD_CMD, cmdreg); + + host->intr_en &= MVSD_NOR_CARD_INT; + host->intr_en |= intr | MVSD_NOR_ERROR; + mvsd_write(MVSD_NOR_INTR_EN, host->intr_en); + mvsd_write(MVSD_ERR_INTR_EN, 0xffff); + + mod_timer(&host->timer, jiffies + 5 * HZ); + + spin_unlock_irqrestore(&host->lock, flags); +} + +static u32 mvsd_finish_cmd(struct mvsd_host *host, struct mmc_command *cmd, + u32 err_status) +{ + void __iomem *iobase = host->base; + + if (cmd->flags & MMC_RSP_136) { + unsigned int response[8], i; + for (i = 0; i < 8; i++) + response[i] = mvsd_read(MVSD_RSP(i)); + cmd->resp[0] = ((response[0] & 0x03ff) << 22) | + ((response[1] & 0xffff) << 6) | + ((response[2] & 0xfc00) >> 10); + cmd->resp[1] = ((response[2] & 0x03ff) << 22) | + ((response[3] & 0xffff) << 6) | + ((response[4] & 0xfc00) >> 10); + cmd->resp[2] = ((response[4] & 0x03ff) << 22) | + ((response[5] & 0xffff) << 6) | + ((response[6] & 0xfc00) >> 10); + cmd->resp[3] = ((response[6] & 0x03ff) << 22) | + ((response[7] & 0x3fff) << 8); + } else if (cmd->flags & MMC_RSP_PRESENT) { + unsigned int response[3], i; + for (i = 0; i < 3; i++) + response[i] = mvsd_read(MVSD_RSP(i)); + cmd->resp[0] = ((response[2] & 0x003f) << (8 - 8)) | + ((response[1] & 0xffff) << (14 - 8)) | + ((response[0] & 0x03ff) << (30 - 8)); + cmd->resp[1] = ((response[0] & 0xfc00) >> 10); + cmd->resp[2] = 0; + cmd->resp[3] = 0; + } + + if (err_status & MVSD_ERR_CMD_TIMEOUT) { + cmd->error = -ETIMEDOUT; + } else if (err_status & (MVSD_ERR_CMD_CRC | MVSD_ERR_CMD_ENDBIT | + MVSD_ERR_CMD_INDEX | MVSD_ERR_CMD_STARTBIT)) { + cmd->error = -EILSEQ; + } + err_status &= ~(MVSD_ERR_CMD_TIMEOUT | MVSD_ERR_CMD_CRC | + MVSD_ERR_CMD_ENDBIT | MVSD_ERR_CMD_INDEX | + MVSD_ERR_CMD_STARTBIT); + + return err_status; +} + +static u32 mvsd_finish_data(struct mvsd_host *host, struct mmc_data *data, + u32 err_status) +{ + void __iomem *iobase = host->base; + + if (host->pio_ptr) { + host->pio_ptr = NULL; + host->pio_size = 0; + } else { + dma_unmap_sg(mmc_dev(host->mmc), data->sg, host->sg_frags, + (data->flags & MMC_DATA_READ) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } + + if (err_status & MVSD_ERR_DATA_TIMEOUT) + data->error = -ETIMEDOUT; + else if (err_status & (MVSD_ERR_DATA_CRC | MVSD_ERR_DATA_ENDBIT)) + data->error = -EILSEQ; + else if (err_status & MVSD_ERR_XFER_SIZE) + data->error = -EBADE; + err_status &= ~(MVSD_ERR_DATA_TIMEOUT | MVSD_ERR_DATA_CRC | + MVSD_ERR_DATA_ENDBIT | MVSD_ERR_XFER_SIZE); + + dev_dbg(host->dev, "data done: blocks_left=%d, bytes_left=%d\n", + mvsd_read(MVSD_CURR_BLK_LEFT), mvsd_read(MVSD_CURR_BYTE_LEFT)); + data->bytes_xfered = + (data->blocks - mvsd_read(MVSD_CURR_BLK_LEFT)) * data->blksz; + /* We can't be sure about the last block when errors are detected */ + if (data->bytes_xfered && data->error) + data->bytes_xfered -= data->blksz; + + /* Handle Auto cmd 12 response */ + if (data->stop) { + unsigned int response[3], i; + for (i = 0; i < 3; i++) + response[i] = mvsd_read(MVSD_AUTO_RSP(i)); + data->stop->resp[0] = ((response[2] & 0x003f) << (8 - 8)) | + ((response[1] & 0xffff) << (14 - 8)) | + ((response[0] & 0x03ff) << (30 - 8)); + data->stop->resp[1] = ((response[0] & 0xfc00) >> 10); + data->stop->resp[2] = 0; + data->stop->resp[3] = 0; + + if (err_status & MVSD_ERR_AUTOCMD12) { + u32 err_cmd12 = mvsd_read(MVSD_AUTOCMD12_ERR_STATUS); + dev_dbg(host->dev, "c12err 0x%04x\n", err_cmd12); + if (err_cmd12 & MVSD_AUTOCMD12_ERR_NOTEXE) + data->stop->error = -ENOEXEC; + else if (err_cmd12 & MVSD_AUTOCMD12_ERR_TIMEOUT) + data->stop->error = -ETIMEDOUT; + else if (err_cmd12) + data->stop->error = -EILSEQ; + err_status &= ~MVSD_ERR_AUTOCMD12; + } + } + + return err_status; +} + +static irqreturn_t mvsd_irq(int irq, void *dev) +{ + struct mvsd_host *host = dev; + void __iomem *iobase = host->base; + u32 intr_status, intr_done_mask; + int irq_handled = 0; + + intr_status = mvsd_read(MVSD_NOR_INTR_STATUS); + dev_dbg(host->dev, "intr 0x%04x intr_en 0x%04x hw_state 0x%04x\n", + intr_status, mvsd_read(MVSD_NOR_INTR_EN), + mvsd_read(MVSD_HW_STATE)); + + spin_lock(&host->lock); + + /* PIO handling, if needed. Messy business... */ + if (host->pio_size && + (intr_status & host->intr_en & + (MVSD_NOR_RX_READY | MVSD_NOR_RX_FIFO_8W))) { + u16 *p = host->pio_ptr; + int s = host->pio_size; + while (s >= 32 && (intr_status & MVSD_NOR_RX_FIFO_8W)) { + readsw(iobase + MVSD_FIFO, p, 16); + p += 16; + s -= 32; + intr_status = mvsd_read(MVSD_NOR_INTR_STATUS); + } + /* + * Normally we'd use < 32 here, but the RX_FIFO_8W bit + * doesn't appear to assert when there is exactly 32 bytes + * (8 words) left to fetch in a transfer. + */ + if (s <= 32) { + while (s >= 4 && (intr_status & MVSD_NOR_RX_READY)) { + put_unaligned(mvsd_read(MVSD_FIFO), p++); + put_unaligned(mvsd_read(MVSD_FIFO), p++); + s -= 4; + intr_status = mvsd_read(MVSD_NOR_INTR_STATUS); + } + if (s && s < 4 && (intr_status & MVSD_NOR_RX_READY)) { + u16 val[2] = {0, 0}; + val[0] = mvsd_read(MVSD_FIFO); + val[1] = mvsd_read(MVSD_FIFO); + memcpy(p, &val, s); + s = 0; + intr_status = mvsd_read(MVSD_NOR_INTR_STATUS); + } + if (s == 0) { + host->intr_en &= + ~(MVSD_NOR_RX_READY | MVSD_NOR_RX_FIFO_8W); + mvsd_write(MVSD_NOR_INTR_EN, host->intr_en); + } else if (host->intr_en & MVSD_NOR_RX_FIFO_8W) { + host->intr_en &= ~MVSD_NOR_RX_FIFO_8W; + host->intr_en |= MVSD_NOR_RX_READY; + mvsd_write(MVSD_NOR_INTR_EN, host->intr_en); + } + } + dev_dbg(host->dev, "pio %d intr 0x%04x hw_state 0x%04x\n", + s, intr_status, mvsd_read(MVSD_HW_STATE)); + host->pio_ptr = p; + host->pio_size = s; + irq_handled = 1; + } else if (host->pio_size && + (intr_status & host->intr_en & + (MVSD_NOR_TX_AVAIL | MVSD_NOR_TX_FIFO_8W))) { + u16 *p = host->pio_ptr; + int s = host->pio_size; + /* + * The TX_FIFO_8W bit is unreliable. When set, bursting + * 16 halfwords all at once in the FIFO drops data. Actually + * TX_AVAIL does go off after only one word is pushed even if + * TX_FIFO_8W remains set. + */ + while (s >= 4 && (intr_status & MVSD_NOR_TX_AVAIL)) { + mvsd_write(MVSD_FIFO, get_unaligned(p++)); + mvsd_write(MVSD_FIFO, get_unaligned(p++)); + s -= 4; + intr_status = mvsd_read(MVSD_NOR_INTR_STATUS); + } + if (s < 4) { + if (s && (intr_status & MVSD_NOR_TX_AVAIL)) { + u16 val[2] = {0, 0}; + memcpy(&val, p, s); + mvsd_write(MVSD_FIFO, val[0]); + mvsd_write(MVSD_FIFO, val[1]); + s = 0; + intr_status = mvsd_read(MVSD_NOR_INTR_STATUS); + } + if (s == 0) { + host->intr_en &= + ~(MVSD_NOR_TX_AVAIL | MVSD_NOR_TX_FIFO_8W); + mvsd_write(MVSD_NOR_INTR_EN, host->intr_en); + } + } + dev_dbg(host->dev, "pio %d intr 0x%04x hw_state 0x%04x\n", + s, intr_status, mvsd_read(MVSD_HW_STATE)); + host->pio_ptr = p; + host->pio_size = s; + irq_handled = 1; + } + + mvsd_write(MVSD_NOR_INTR_STATUS, intr_status); + + intr_done_mask = MVSD_NOR_CARD_INT | MVSD_NOR_RX_READY | + MVSD_NOR_RX_FIFO_8W | MVSD_NOR_TX_FIFO_8W; + if (intr_status & host->intr_en & ~intr_done_mask) { + struct mmc_request *mrq = host->mrq; + struct mmc_command *cmd = mrq->cmd; + u32 err_status = 0; + + del_timer(&host->timer); + host->mrq = NULL; + + host->intr_en &= MVSD_NOR_CARD_INT; + mvsd_write(MVSD_NOR_INTR_EN, host->intr_en); + mvsd_write(MVSD_ERR_INTR_EN, 0); + + spin_unlock(&host->lock); + + if (intr_status & MVSD_NOR_UNEXP_RSP) { + cmd->error = -EPROTO; + } else if (intr_status & MVSD_NOR_ERROR) { + err_status = mvsd_read(MVSD_ERR_INTR_STATUS); + dev_dbg(host->dev, "err 0x%04x\n", err_status); + } + + err_status = mvsd_finish_cmd(host, cmd, err_status); + if (mrq->data) + err_status = mvsd_finish_data(host, mrq->data, err_status); + if (err_status) { + printk(KERN_ERR "%s: unhandled error status %#04x\n", + mmc_hostname(host->mmc), err_status); + cmd->error = -ENOMSG; + } + + mmc_request_done(host->mmc, mrq); + irq_handled = 1; + } else + spin_unlock(&host->lock); + + if (intr_status & MVSD_NOR_CARD_INT) { + mmc_signal_sdio_irq(host->mmc); + irq_handled = 1; + } + + if (irq_handled) + return IRQ_HANDLED; + + printk(KERN_ERR "%s: unhandled interrupt status=0x%04x en=0x%04x " + "pio=%d\n", mmc_hostname(host->mmc), intr_status, + host->intr_en, host->pio_size); + return IRQ_NONE; +} + +static void mvsd_timeout_timer(unsigned long data) +{ + struct mvsd_host *host = (struct mvsd_host *)data; + void __iomem *iobase = host->base; + struct mmc_request *mrq; + unsigned long flags; + + spin_lock_irqsave(&host->lock, flags); + mrq = host->mrq; + if (mrq) { + printk(KERN_ERR "%s: Timeout waiting for hardware interrupt.\n", + mmc_hostname(host->mmc)); + printk(KERN_ERR "%s: hw_state=0x%04x, intr_status=0x%04x " + "intr_en=0x%04x\n", mmc_hostname(host->mmc), + mvsd_read(MVSD_HW_STATE), + mvsd_read(MVSD_NOR_INTR_STATUS), + mvsd_read(MVSD_NOR_INTR_EN)); + + host->mrq = NULL; + + mvsd_write(MVSD_SW_RESET, MVSD_SW_RESET_NOW); + + host->xfer_mode &= MVSD_XFER_MODE_INT_CHK_EN; + mvsd_write(MVSD_XFER_MODE, host->xfer_mode); + + host->intr_en &= MVSD_NOR_CARD_INT; + mvsd_write(MVSD_NOR_INTR_EN, host->intr_en); + mvsd_write(MVSD_ERR_INTR_EN, 0); + mvsd_write(MVSD_ERR_INTR_STATUS, 0xffff); + + mrq->cmd->error = -ETIMEDOUT; + mvsd_finish_cmd(host, mrq->cmd, 0); + if (mrq->data) { + mrq->data->error = -ETIMEDOUT; + mvsd_finish_data(host, mrq->data, 0); + } + } + spin_unlock_irqrestore(&host->lock, flags); + + if (mrq) + mmc_request_done(host->mmc, mrq); +} + +static irqreturn_t mvsd_card_detect_irq(int irq, void *dev) +{ + struct mvsd_host *host = dev; + mmc_detect_change(host->mmc, msecs_to_jiffies(100)); + return IRQ_HANDLED; +} + +static void mvsd_enable_sdio_irq(struct mmc_host *mmc, int enable) +{ + struct mvsd_host *host = mmc_priv(mmc); + void __iomem *iobase = host->base; + unsigned long flags; + + spin_lock_irqsave(&host->lock, flags); + if (enable) { + host->xfer_mode |= MVSD_XFER_MODE_INT_CHK_EN; + host->intr_en |= MVSD_NOR_CARD_INT; + } else { + host->xfer_mode &= ~MVSD_XFER_MODE_INT_CHK_EN; + host->intr_en &= ~MVSD_NOR_CARD_INT; + } + mvsd_write(MVSD_XFER_MODE, host->xfer_mode); + mvsd_write(MVSD_NOR_INTR_EN, host->intr_en); + spin_unlock_irqrestore(&host->lock, flags); +} + +static int mvsd_get_ro(struct mmc_host *mmc) +{ + struct mvsd_host *host = mmc_priv(mmc); + + if (host->gpio_write_protect) + return gpio_get_value(host->gpio_write_protect); + + /* + * Board doesn't support read only detection; let the mmc core + * decide what to do. + */ + return -ENOSYS; +} + +static void mvsd_power_up(struct mvsd_host *host) +{ + void __iomem *iobase = host->base; + dev_dbg(host->dev, "power up\n"); + mvsd_write(MVSD_NOR_INTR_EN, 0); + mvsd_write(MVSD_ERR_INTR_EN, 0); + mvsd_write(MVSD_SW_RESET, MVSD_SW_RESET_NOW); + mvsd_write(MVSD_XFER_MODE, 0); + mvsd_write(MVSD_NOR_STATUS_EN, 0xffff); + mvsd_write(MVSD_ERR_STATUS_EN, 0xffff); + mvsd_write(MVSD_NOR_INTR_STATUS, 0xffff); + mvsd_write(MVSD_ERR_INTR_STATUS, 0xffff); +} + +static void mvsd_power_down(struct mvsd_host *host) +{ + void __iomem *iobase = host->base; + dev_dbg(host->dev, "power down\n"); + mvsd_write(MVSD_NOR_INTR_EN, 0); + mvsd_write(MVSD_ERR_INTR_EN, 0); + mvsd_write(MVSD_SW_RESET, MVSD_SW_RESET_NOW); + mvsd_write(MVSD_XFER_MODE, MVSD_XFER_MODE_STOP_CLK); + mvsd_write(MVSD_NOR_STATUS_EN, 0); + mvsd_write(MVSD_ERR_STATUS_EN, 0); + mvsd_write(MVSD_NOR_INTR_STATUS, 0xffff); + mvsd_write(MVSD_ERR_INTR_STATUS, 0xffff); +} + +static void mvsd_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) +{ + struct mvsd_host *host = mmc_priv(mmc); + void __iomem *iobase = host->base; + u32 ctrl_reg = 0; + + if (ios->power_mode == MMC_POWER_UP) + mvsd_power_up(host); + + if (ios->clock == 0) { + mvsd_write(MVSD_XFER_MODE, MVSD_XFER_MODE_STOP_CLK); + mvsd_write(MVSD_CLK_DIV, MVSD_BASE_DIV_MAX); + host->clock = 0; + dev_dbg(host->dev, "clock off\n"); + } else if (ios->clock != host->clock) { + u32 m = DIV_ROUND_UP(host->base_clock, ios->clock) - 1; + if (m > MVSD_BASE_DIV_MAX) + m = MVSD_BASE_DIV_MAX; + mvsd_write(MVSD_CLK_DIV, m); + host->clock = ios->clock; + host->ns_per_clk = 1000000000 / (host->base_clock / (m+1)); + dev_dbg(host->dev, "clock=%d (%d), div=0x%04x\n", + ios->clock, host->base_clock / (m+1), m); + } + + /* default transfer mode */ + ctrl_reg |= MVSD_HOST_CTRL_BIG_ENDIAN; + ctrl_reg &= ~MVSD_HOST_CTRL_LSB_FIRST; + + /* default to maximum timeout */ + ctrl_reg |= MVSD_HOST_CTRL_TMOUT_MASK; + ctrl_reg |= MVSD_HOST_CTRL_TMOUT_EN; + + if (ios->bus_mode == MMC_BUSMODE_PUSHPULL) + ctrl_reg |= MVSD_HOST_CTRL_PUSH_PULL_EN; + + if (ios->bus_width == MMC_BUS_WIDTH_4) + ctrl_reg |= MVSD_HOST_CTRL_DATA_WIDTH_4_BITS; + + if (ios->timing == MMC_TIMING_MMC_HS || + ios->timing == MMC_TIMING_SD_HS) + ctrl_reg |= MVSD_HOST_CTRL_HI_SPEED_EN; + + host->ctrl = ctrl_reg; + mvsd_write(MVSD_HOST_CTRL, ctrl_reg); + dev_dbg(host->dev, "ctrl 0x%04x: %s %s %s\n", ctrl_reg, + (ctrl_reg & MVSD_HOST_CTRL_PUSH_PULL_EN) ? + "push-pull" : "open-drain", + (ctrl_reg & MVSD_HOST_CTRL_DATA_WIDTH_4_BITS) ? + "4bit-width" : "1bit-width", + (ctrl_reg & MVSD_HOST_CTRL_HI_SPEED_EN) ? + "high-speed" : ""); + + if (ios->power_mode == MMC_POWER_OFF) + mvsd_power_down(host); +} + +static const struct mmc_host_ops mvsd_ops = { + .request = mvsd_request, + .get_ro = mvsd_get_ro, + .set_ios = mvsd_set_ios, + .enable_sdio_irq = mvsd_enable_sdio_irq, +}; + +static void __init mv_conf_mbus_windows(struct mvsd_host *host, + struct mbus_dram_target_info *dram) +{ + void __iomem *iobase = host->base; + int i; + + for (i = 0; i < 4; i++) { + writel(0, iobase + MVSD_WINDOW_CTRL(i)); + writel(0, iobase + MVSD_WINDOW_BASE(i)); + } + + for (i = 0; i < dram->num_cs; i++) { + struct mbus_dram_window *cs = dram->cs + i; + writel(((cs->size - 1) & 0xffff0000) | + (cs->mbus_attr << 8) | + (dram->mbus_dram_target_id << 4) | 1, + iobase + MVSD_WINDOW_CTRL(i)); + writel(cs->base, iobase + MVSD_WINDOW_BASE(i)); + } +} + +static int __init mvsd_probe(struct platform_device *pdev) +{ + struct mmc_host *mmc = NULL; + struct mvsd_host *host = NULL; + const struct mvsdio_platform_data *mvsd_data; + struct resource *r; + int ret, irq; + + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + irq = platform_get_irq(pdev, 0); + mvsd_data = pdev->dev.platform_data; + if (!r || irq < 0 || !mvsd_data) + return -ENXIO; + + r = request_mem_region(r->start, SZ_1K, DRIVER_NAME); + if (!r) + return -EBUSY; + + mmc = mmc_alloc_host(sizeof(struct mvsd_host), &pdev->dev); + if (!mmc) { + ret = -ENOMEM; + goto out; + } + + host = mmc_priv(mmc); + host->mmc = mmc; + host->dev = &pdev->dev; + host->res = r; + host->base_clock = mvsd_data->clock / 2; + + mmc->ops = &mvsd_ops; + + mmc->ocr_avail = MMC_VDD_32_33 | MMC_VDD_33_34; + mmc->caps = MMC_CAP_4_BIT_DATA | MMC_CAP_SDIO_IRQ | + MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED; + + mmc->f_min = DIV_ROUND_UP(host->base_clock, MVSD_BASE_DIV_MAX); + mmc->f_max = maxfreq; + + mmc->max_blk_size = 2048; + mmc->max_blk_count = 65535; + + mmc->max_hw_segs = 1; + mmc->max_phys_segs = 1; + mmc->max_seg_size = mmc->max_blk_size * mmc->max_blk_count; + mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count; + + spin_lock_init(&host->lock); + + host->base = ioremap(r->start, SZ_4K); + if (!host->base) { + ret = -ENOMEM; + goto out; + } + + /* (Re-)program MBUS remapping windows if we are asked to. */ + if (mvsd_data->dram != NULL) + mv_conf_mbus_windows(host, mvsd_data->dram); + + mvsd_power_down(host); + + ret = request_irq(irq, mvsd_irq, 0, DRIVER_NAME, host); + if (ret) { + printk(KERN_ERR "%s: cannot assign irq %d\n", DRIVER_NAME, irq); + goto out; + } else + host->irq = irq; + + if (mvsd_data->gpio_card_detect) { + ret = gpio_request(mvsd_data->gpio_card_detect, + DRIVER_NAME " cd"); + if (ret == 0) { + gpio_direction_input(mvsd_data->gpio_card_detect); + irq = gpio_to_irq(mvsd_data->gpio_card_detect); + ret = request_irq(irq, mvsd_card_detect_irq, + IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING, + DRIVER_NAME " cd", host); + if (ret == 0) + host->gpio_card_detect = + mvsd_data->gpio_card_detect; + else + gpio_free(mvsd_data->gpio_card_detect); + } + } + if (!host->gpio_card_detect) + mmc->caps |= MMC_CAP_NEEDS_POLL; + + if (mvsd_data->gpio_write_protect) { + ret = gpio_request(mvsd_data->gpio_write_protect, + DRIVER_NAME " wp"); + if (ret == 0) { + gpio_direction_input(mvsd_data->gpio_write_protect); + host->gpio_write_protect = + mvsd_data->gpio_write_protect; + } + } + + setup_timer(&host->timer, mvsd_timeout_timer, (unsigned long)host); + platform_set_drvdata(pdev, mmc); + ret = mmc_add_host(mmc); + if (ret) + goto out; + + printk(KERN_NOTICE "%s: %s driver initialized, ", + mmc_hostname(mmc), DRIVER_NAME); + if (host->gpio_card_detect) + printk("using GPIO %d for card detection\n", + host->gpio_card_detect); + else + printk("lacking card detect (fall back to polling)\n"); + return 0; + +out: + if (host) { + if (host->irq) + free_irq(host->irq, host); + if (host->gpio_card_detect) { + free_irq(gpio_to_irq(host->gpio_card_detect), host); + gpio_free(host->gpio_card_detect); + } + if (host->gpio_write_protect) + gpio_free(host->gpio_write_protect); + if (host->base) + iounmap(host->base); + } + if (r) + release_resource(r); + if (mmc) + mmc_free_host(mmc); + + return ret; +} + +static int __exit mvsd_remove(struct platform_device *pdev) +{ + struct mmc_host *mmc = platform_get_drvdata(pdev); + + if (mmc) { + struct mvsd_host *host = mmc_priv(mmc); + + if (host->gpio_card_detect) { + free_irq(gpio_to_irq(host->gpio_card_detect), host); + gpio_free(host->gpio_card_detect); + } + mmc_remove_host(mmc); + free_irq(host->irq, host); + if (host->gpio_write_protect) + gpio_free(host->gpio_write_protect); + del_timer_sync(&host->timer); + mvsd_power_down(host); + iounmap(host->base); + release_resource(host->res); + mmc_free_host(mmc); + } + platform_set_drvdata(pdev, NULL); + return 0; +} + +#ifdef CONFIG_PM +static int mvsd_suspend(struct platform_device *dev, pm_message_t state, + u32 level) +{ + struct mmc_host *mmc = platform_get_drvdata(dev); + int ret = 0; + + if (mmc && level == SUSPEND_DISABLE) + ret = mmc_suspend_host(mmc, state); + + return ret; +} + +static int mvsd_resume(struct platform_device *dev, u32 level) +{ + struct mmc_host *mmc = platform_dev_get_drvdata(dev); + int ret = 0; + + if (mmc && level == RESUME_ENABLE) + ret = mmc_resume_host(mmc); + + return ret; +} +#else +#define mvsd_suspend NULL +#define mvsd_resume NULL +#endif + +static struct platform_driver mvsd_driver = { + .remove = __exit_p(mvsd_remove), + .suspend = mvsd_suspend, + .resume = mvsd_resume, + .driver = { + .name = DRIVER_NAME, + }, +}; + +static int __init mvsd_init(void) +{ + return platform_driver_probe(&mvsd_driver, mvsd_probe); +} + +static void __exit mvsd_exit(void) +{ + platform_driver_unregister(&mvsd_driver); +} + +module_init(mvsd_init); +module_exit(mvsd_exit); + +/* maximum card clock frequency (default 50MHz) */ +module_param(maxfreq, int, 0); + +/* force PIO transfers all the time */ +module_param(nodma, int, 0); + +MODULE_AUTHOR("Maen Suleiman, Nicolas Pitre"); +MODULE_DESCRIPTION("Marvell MMC,SD,SDIO Host Controller driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mmc/host/mvsdio.h b/drivers/mmc/host/mvsdio.h new file mode 100644 index 000000000000..7d9727b9f5aa --- /dev/null +++ b/drivers/mmc/host/mvsdio.h @@ -0,0 +1,190 @@ +/* + * Copyright (C) 2008 Marvell Semiconductors, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MVSDIO_H +#define __MVSDIO_H + +/* + * Clock rates + */ + +#define MVSD_CLOCKRATE_MAX 50000000 +#define MVSD_BASE_DIV_MAX 0x7ff + + +/* + * Register offsets + */ + +#define MVSD_SYS_ADDR_LOW 0x000 +#define MVSD_SYS_ADDR_HI 0x004 +#define MVSD_BLK_SIZE 0x008 +#define MVSD_BLK_COUNT 0x00c +#define MVSD_ARG_LOW 0x010 +#define MVSD_ARG_HI 0x014 +#define MVSD_XFER_MODE 0x018 +#define MVSD_CMD 0x01c +#define MVSD_RSP(i) (0x020 + ((i)<<2)) +#define MVSD_RSP0 0x020 +#define MVSD_RSP1 0x024 +#define MVSD_RSP2 0x028 +#define MVSD_RSP3 0x02c +#define MVSD_RSP4 0x030 +#define MVSD_RSP5 0x034 +#define MVSD_RSP6 0x038 +#define MVSD_RSP7 0x03c +#define MVSD_FIFO 0x040 +#define MVSD_RSP_CRC7 0x044 +#define MVSD_HW_STATE 0x048 +#define MVSD_HOST_CTRL 0x050 +#define MVSD_BLK_GAP_CTRL 0x054 +#define MVSD_CLK_CTRL 0x058 +#define MVSD_SW_RESET 0x05c +#define MVSD_NOR_INTR_STATUS 0x060 +#define MVSD_ERR_INTR_STATUS 0x064 +#define MVSD_NOR_STATUS_EN 0x068 +#define MVSD_ERR_STATUS_EN 0x06c +#define MVSD_NOR_INTR_EN 0x070 +#define MVSD_ERR_INTR_EN 0x074 +#define MVSD_AUTOCMD12_ERR_STATUS 0x078 +#define MVSD_CURR_BYTE_LEFT 0x07c +#define MVSD_CURR_BLK_LEFT 0x080 +#define MVSD_AUTOCMD12_ARG_LOW 0x084 +#define MVSD_AUTOCMD12_ARG_HI 0x088 +#define MVSD_AUTOCMD12_CMD 0x08c +#define MVSD_AUTO_RSP(i) (0x090 + ((i)<<2)) +#define MVSD_AUTO_RSP0 0x090 +#define MVSD_AUTO_RSP1 0x094 +#define MVSD_AUTO_RSP2 0x098 +#define MVSD_CLK_DIV 0x128 + +#define MVSD_WINDOW_CTRL(i) (0x108 + ((i) << 3)) +#define MVSD_WINDOW_BASE(i) (0x10c + ((i) << 3)) + + +/* + * MVSD_CMD + */ + +#define MVSD_CMD_RSP_NONE (0 << 0) +#define MVSD_CMD_RSP_136 (1 << 0) +#define MVSD_CMD_RSP_48 (2 << 0) +#define MVSD_CMD_RSP_48BUSY (3 << 0) + +#define MVSD_CMD_CHECK_DATACRC16 (1 << 2) +#define MVSD_CMD_CHECK_CMDCRC (1 << 3) +#define MVSD_CMD_INDX_CHECK (1 << 4) +#define MVSD_CMD_DATA_PRESENT (1 << 5) +#define MVSD_UNEXPECTED_RESP (1 << 7) +#define MVSD_CMD_INDEX(x) ((x) << 8) + + +/* + * MVSD_AUTOCMD12_CMD + */ + +#define MVSD_AUTOCMD12_BUSY (1 << 0) +#define MVSD_AUTOCMD12_INDX_CHECK (1 << 1) +#define MVSD_AUTOCMD12_INDEX(x) ((x) << 8) + +/* + * MVSD_XFER_MODE + */ + +#define MVSD_XFER_MODE_WR_DATA_START (1 << 0) +#define MVSD_XFER_MODE_HW_WR_DATA_EN (1 << 1) +#define MVSD_XFER_MODE_AUTO_CMD12 (1 << 2) +#define MVSD_XFER_MODE_INT_CHK_EN (1 << 3) +#define MVSD_XFER_MODE_TO_HOST (1 << 4) +#define MVSD_XFER_MODE_STOP_CLK (1 << 5) +#define MVSD_XFER_MODE_PIO (1 << 6) + + +/* + * MVSD_HOST_CTRL + */ + +#define MVSD_HOST_CTRL_PUSH_PULL_EN (1 << 0) + +#define MVSD_HOST_CTRL_CARD_TYPE_MEM_ONLY (0 << 1) +#define MVSD_HOST_CTRL_CARD_TYPE_IO_ONLY (1 << 1) +#define MVSD_HOST_CTRL_CARD_TYPE_IO_MEM_COMBO (2 << 1) +#define MVSD_HOST_CTRL_CARD_TYPE_IO_MMC (3 << 1) +#define MVSD_HOST_CTRL_CARD_TYPE_MASK (3 << 1) + +#define MVSD_HOST_CTRL_BIG_ENDIAN (1 << 3) +#define MVSD_HOST_CTRL_LSB_FIRST (1 << 4) +#define MVSD_HOST_CTRL_DATA_WIDTH_4_BITS (1 << 9) +#define MVSD_HOST_CTRL_HI_SPEED_EN (1 << 10) + +#define MVSD_HOST_CTRL_TMOUT_MAX 0xf +#define MVSD_HOST_CTRL_TMOUT_MASK (0xf << 11) +#define MVSD_HOST_CTRL_TMOUT(x) ((x) << 11) +#define MVSD_HOST_CTRL_TMOUT_EN (1 << 15) + + +/* + * MVSD_SW_RESET + */ + +#define MVSD_SW_RESET_NOW (1 << 8) + + +/* + * Normal interrupt status bits + */ + +#define MVSD_NOR_CMD_DONE (1 << 0) +#define MVSD_NOR_XFER_DONE (1 << 1) +#define MVSD_NOR_BLK_GAP_EVT (1 << 2) +#define MVSD_NOR_DMA_DONE (1 << 3) +#define MVSD_NOR_TX_AVAIL (1 << 4) +#define MVSD_NOR_RX_READY (1 << 5) +#define MVSD_NOR_CARD_INT (1 << 8) +#define MVSD_NOR_READ_WAIT_ON (1 << 9) +#define MVSD_NOR_RX_FIFO_8W (1 << 10) +#define MVSD_NOR_TX_FIFO_8W (1 << 11) +#define MVSD_NOR_SUSPEND_ON (1 << 12) +#define MVSD_NOR_AUTOCMD12_DONE (1 << 13) +#define MVSD_NOR_UNEXP_RSP (1 << 14) +#define MVSD_NOR_ERROR (1 << 15) + + +/* + * Error status bits + */ + +#define MVSD_ERR_CMD_TIMEOUT (1 << 0) +#define MVSD_ERR_CMD_CRC (1 << 1) +#define MVSD_ERR_CMD_ENDBIT (1 << 2) +#define MVSD_ERR_CMD_INDEX (1 << 3) +#define MVSD_ERR_DATA_TIMEOUT (1 << 4) +#define MVSD_ERR_DATA_CRC (1 << 5) +#define MVSD_ERR_DATA_ENDBIT (1 << 6) +#define MVSD_ERR_AUTOCMD12 (1 << 8) +#define MVSD_ERR_CMD_STARTBIT (1 << 9) +#define MVSD_ERR_XFER_SIZE (1 << 10) +#define MVSD_ERR_RESP_T_BIT (1 << 11) +#define MVSD_ERR_CRC_ENDBIT (1 << 12) +#define MVSD_ERR_CRC_STARTBIT (1 << 13) +#define MVSD_ERR_CRC_STATUS (1 << 14) + + +/* + * CMD12 error status bits + */ + +#define MVSD_AUTOCMD12_ERR_NOTEXE (1 << 0) +#define MVSD_AUTOCMD12_ERR_TIMEOUT (1 << 1) +#define MVSD_AUTOCMD12_ERR_CRC (1 << 2) +#define MVSD_AUTOCMD12_ERR_ENDBIT (1 << 3) +#define MVSD_AUTOCMD12_ERR_INDEX (1 << 4) +#define MVSD_AUTOCMD12_ERR_RESP_T_BIT (1 << 5) +#define MVSD_AUTOCMD12_ERR_RESP_STARTBIT (1 << 6) + +#endif -- cgit v1.2.3-59-g8ed1b From 2a69567b875b0650ffe29a0f2441e2068e4e8294 Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Mon, 16 Mar 2009 19:52:26 +0100 Subject: mmc: add maintainer for mvsdio driver Nicolas Pitre accepted to look after the mvsdio driver. Signed-off-by: Pierre Ossman --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index e1fe959b9f83..d17916ce2155 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2891,6 +2891,12 @@ M: buytenh@marvell.com L: netdev@vger.kernel.org S: Supported +MARVELL SOC MMC/SD/SDIO CONTROLLER DRIVER +P: Nicolas Pitre +M: nico@cam.org +L: linux-kernel@vger.kernel.org +S: Maintained + MARVELL YUKON / SYSKONNECT DRIVER P: Mirko Lindner M: mlindner@syskonnect.de -- cgit v1.2.3-59-g8ed1b From 82788ff532f75ecd23166e677c970139ff61c363 Mon Sep 17 00:00:00 2001 From: Jarkko Lavinen Date: Fri, 5 Dec 2008 12:31:46 +0200 Subject: omap_hsmmc: Do dma cleanup also with data CRC errors Signed-off-by: Jarkko Lavinen Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 576bfa7216b3..5ff2ca22beea 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -332,9 +332,9 @@ mmc_omap_cmd_done(struct mmc_omap_host *host, struct mmc_command *cmd) /* * DMA clean up for command errors */ -static void mmc_dma_cleanup(struct mmc_omap_host *host) +static void mmc_dma_cleanup(struct mmc_omap_host *host, int errno) { - host->data->error = -ETIMEDOUT; + host->data->error = errno; if (host->use_dma && host->dma_ch != -1) { dma_unmap_sg(mmc_dev(host->mmc), host->data->sg, host->dma_len, @@ -439,7 +439,7 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) end_cmd = 1; } if (host->data) { - mmc_dma_cleanup(host); + mmc_dma_cleanup(host, -ETIMEDOUT); mmc_omap_reset_controller_fsm(host, SRD); } } @@ -447,9 +447,9 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) (status & DATA_CRC)) { if (host->data) { if (status & DATA_TIMEOUT) - mmc_dma_cleanup(host); + mmc_dma_cleanup(host, -ETIMEDOUT); else - host->data->error = -EILSEQ; + mmc_dma_cleanup(host, -EILSEQ); mmc_omap_reset_controller_fsm(host, SRD); end_trans = 1; } -- cgit v1.2.3-59-g8ed1b From 4a694dc915c9a223044ce21fc0d99e63facd1d64 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 12 Jan 2009 16:13:08 +0200 Subject: omap_hsmmc: Fix response type for busy after response Some MMC commands result in the card becoming busy after the response is received. This needs to be specified for the omap_hsmmc host controller, which is what this patch does. However, the effect is that some commands with no data will cause a Transfer Complete (TC) interrupt in addition to the Command Complete (CC) interrupt. In order to deal with that, the irq handler has needed a few changes also. The benefit of this change is that the omap_hsmmc host controller driver now waits for the TC interrupt while the card is busy, so the mmc_block driver needs to poll the card status just once instead of repeatedly. i.e. the net result is more sleep and less cpu. The command sequence for open-ended multi-block write with DMA is now: Issue write command CMD25 Receive CC interrupt Data is sent Receive TC interrupt (DMA is done) Issue stop command CMD12 Receive CC interrupt Card is busy Receive TC interrupt Card is now ready for next transfer Signed-off-by: Adrian Hunter Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 5ff2ca22beea..1f84bd4a3b27 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -150,6 +150,7 @@ struct mmc_omap_host { int initstr; int slot_id; int dbclk_enabled; + int response_busy; struct omap_mmc_platform_data *pdata; }; @@ -244,10 +245,14 @@ mmc_omap_start_command(struct mmc_omap_host *host, struct mmc_command *cmd, OMAP_HSMMC_WRITE(host->base, ISE, INT_EN_MASK); OMAP_HSMMC_WRITE(host->base, IE, INT_EN_MASK); + host->response_busy = 0; if (cmd->flags & MMC_RSP_PRESENT) { if (cmd->flags & MMC_RSP_136) resptype = 1; - else + else if (cmd->flags & MMC_RSP_BUSY) { + resptype = 3; + host->response_busy = 1; + } else resptype = 2; } @@ -282,6 +287,15 @@ mmc_omap_start_command(struct mmc_omap_host *host, struct mmc_command *cmd, static void mmc_omap_xfer_done(struct mmc_omap_host *host, struct mmc_data *data) { + if (!data) { + struct mmc_request *mrq = host->mrq; + + host->mrq = NULL; + mmc_omap_fclk_lazy_disable(host); + mmc_request_done(host->mmc, mrq); + return; + } + host->data = NULL; if (host->use_dma && host->dma_ch != -1) @@ -323,7 +337,7 @@ mmc_omap_cmd_done(struct mmc_omap_host *host, struct mmc_command *cmd) cmd->resp[0] = OMAP_HSMMC_READ(host->base, RSP10); } } - if (host->data == NULL || cmd->error) { + if ((host->data == NULL && !host->response_busy) || cmd->error) { host->mrq = NULL; mmc_request_done(host->mmc, cmd->mrq); } @@ -413,7 +427,7 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) struct mmc_data *data; int end_cmd = 0, end_trans = 0, status; - if (host->cmd == NULL && host->data == NULL) { + if (host->mrq == NULL) { OMAP_HSMMC_WRITE(host->base, STAT, OMAP_HSMMC_READ(host->base, STAT)); return IRQ_HANDLED; @@ -438,18 +452,24 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) } end_cmd = 1; } - if (host->data) { - mmc_dma_cleanup(host, -ETIMEDOUT); + if (host->data || host->response_busy) { + if (host->data) + mmc_dma_cleanup(host, -ETIMEDOUT); + host->response_busy = 0; mmc_omap_reset_controller_fsm(host, SRD); } } if ((status & DATA_TIMEOUT) || (status & DATA_CRC)) { - if (host->data) { - if (status & DATA_TIMEOUT) - mmc_dma_cleanup(host, -ETIMEDOUT); + if (host->data || host->response_busy) { + int err = (status & DATA_TIMEOUT) ? + -ETIMEDOUT : -EILSEQ; + + if (host->data) + mmc_dma_cleanup(host, err); else - mmc_dma_cleanup(host, -EILSEQ); + host->mrq->cmd->error = err; + host->response_busy = 0; mmc_omap_reset_controller_fsm(host, SRD); end_trans = 1; } -- cgit v1.2.3-59-g8ed1b From 0ccd76d4c236a0cf71efe51848f15c3f5d951da7 Mon Sep 17 00:00:00 2001 From: Juha Yrjola Date: Fri, 14 Nov 2008 15:22:00 +0200 Subject: omap_hsmmc: Implement scatter-gather emulation Instead of using the bounce buffer, using scatter-gather emulation (as in the OMAP1/2 MMC driver) removes the need of one extra memory copy and improves performance. Signed-off-by: Juha Yrjola Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 172 +++++++++++++++++++++++------------------- 1 file changed, 95 insertions(+), 77 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 1f84bd4a3b27..483e591f7bbb 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -100,9 +100,6 @@ #define OMAP_MMC1_DEVID 0 #define OMAP_MMC2_DEVID 1 -#define OMAP_MMC_DATADIR_NONE 0 -#define OMAP_MMC_DATADIR_READ 1 -#define OMAP_MMC_DATADIR_WRITE 2 #define MMC_TIMEOUT_MS 20 #define OMAP_MMC_MASTER_CLOCK 96000000 #define DRIVER_NAME "mmci-omap-hs" @@ -138,16 +135,14 @@ struct mmc_omap_host { resource_size_t mapbase; unsigned int id; unsigned int dma_len; - unsigned int dma_dir; + unsigned int dma_sg_idx; unsigned char bus_mode; - unsigned char datadir; u32 *buffer; u32 bytesleft; int suspended; int irq; int carddetect; int use_dma, dma_ch; - int initstr; int slot_id; int dbclk_enabled; int response_busy; @@ -281,6 +276,15 @@ mmc_omap_start_command(struct mmc_omap_host *host, struct mmc_command *cmd, OMAP_HSMMC_WRITE(host->base, CMD, cmdreg); } +static int +mmc_omap_get_dma_dir(struct mmc_omap_host *host, struct mmc_data *data) +{ + if (data->flags & MMC_DATA_WRITE) + return DMA_TO_DEVICE; + else + return DMA_FROM_DEVICE; +} + /* * Notify the transfer complete to MMC core */ @@ -300,9 +304,7 @@ mmc_omap_xfer_done(struct mmc_omap_host *host, struct mmc_data *data) if (host->use_dma && host->dma_ch != -1) dma_unmap_sg(mmc_dev(host->mmc), data->sg, host->dma_len, - host->dma_dir); - - host->datadir = OMAP_MMC_DATADIR_NONE; + mmc_omap_get_dma_dir(host, data)); if (!data->error) data->bytes_xfered += data->blocks * (data->blksz); @@ -352,13 +354,12 @@ static void mmc_dma_cleanup(struct mmc_omap_host *host, int errno) if (host->use_dma && host->dma_ch != -1) { dma_unmap_sg(mmc_dev(host->mmc), host->data->sg, host->dma_len, - host->dma_dir); + mmc_omap_get_dma_dir(host, host->data)); omap_free_dma(host->dma_ch); host->dma_ch = -1; up(&host->sem); } host->data = NULL; - host->datadir = OMAP_MMC_DATADIR_NONE; } /* @@ -592,6 +593,55 @@ static irqreturn_t omap_mmc_cd_handler(int irq, void *dev_id) return IRQ_HANDLED; } +static int mmc_omap_get_dma_sync_dev(struct mmc_omap_host *host, + struct mmc_data *data) +{ + int sync_dev; + + if (data->flags & MMC_DATA_WRITE) { + if (host->id == OMAP_MMC1_DEVID) + sync_dev = OMAP24XX_DMA_MMC1_TX; + else + sync_dev = OMAP24XX_DMA_MMC2_TX; + } else { + if (host->id == OMAP_MMC1_DEVID) + sync_dev = OMAP24XX_DMA_MMC1_RX; + else + sync_dev = OMAP24XX_DMA_MMC2_RX; + } + return sync_dev; +} + +static void mmc_omap_config_dma_params(struct mmc_omap_host *host, + struct mmc_data *data, + struct scatterlist *sgl) +{ + int blksz, nblk, dma_ch; + + dma_ch = host->dma_ch; + if (data->flags & MMC_DATA_WRITE) { + omap_set_dma_dest_params(dma_ch, 0, OMAP_DMA_AMODE_CONSTANT, + (host->mapbase + OMAP_HSMMC_DATA), 0, 0); + omap_set_dma_src_params(dma_ch, 0, OMAP_DMA_AMODE_POST_INC, + sg_dma_address(sgl), 0, 0); + } else { + omap_set_dma_src_params(dma_ch, 0, OMAP_DMA_AMODE_CONSTANT, + (host->mapbase + OMAP_HSMMC_DATA), 0, 0); + omap_set_dma_dest_params(dma_ch, 0, OMAP_DMA_AMODE_POST_INC, + sg_dma_address(sgl), 0, 0); + } + + blksz = host->data->blksz; + nblk = sg_dma_len(sgl) / blksz; + + omap_set_dma_transfer_params(dma_ch, OMAP_DMA_DATA_TYPE_S32, + blksz / 4, nblk, OMAP_DMA_SYNC_FRAME, + mmc_omap_get_dma_sync_dev(host, data), + !(data->flags & MMC_DATA_WRITE)); + + omap_start_dma(dma_ch); +} + /* * DMA call back function */ @@ -605,6 +655,14 @@ static void mmc_omap_dma_cb(int lch, u16 ch_status, void *data) if (host->dma_ch < 0) return; + host->dma_sg_idx++; + if (host->dma_sg_idx < host->dma_len) { + /* Fire up the next transfer. */ + mmc_omap_config_dma_params(host, host->data, + host->data->sg + host->dma_sg_idx); + return; + } + omap_free_dma(host->dma_ch); host->dma_ch = -1; /* @@ -614,39 +672,29 @@ static void mmc_omap_dma_cb(int lch, u16 ch_status, void *data) up(&host->sem); } -/* - * Configure dma src and destination parameters - */ -static int mmc_omap_config_dma_param(int sync_dir, struct mmc_omap_host *host, - struct mmc_data *data) -{ - if (sync_dir == 0) { - omap_set_dma_dest_params(host->dma_ch, 0, - OMAP_DMA_AMODE_CONSTANT, - (host->mapbase + OMAP_HSMMC_DATA), 0, 0); - omap_set_dma_src_params(host->dma_ch, 0, - OMAP_DMA_AMODE_POST_INC, - sg_dma_address(&data->sg[0]), 0, 0); - } else { - omap_set_dma_src_params(host->dma_ch, 0, - OMAP_DMA_AMODE_CONSTANT, - (host->mapbase + OMAP_HSMMC_DATA), 0, 0); - omap_set_dma_dest_params(host->dma_ch, 0, - OMAP_DMA_AMODE_POST_INC, - sg_dma_address(&data->sg[0]), 0, 0); - } - return 0; -} /* * Routine to configure and start DMA for the MMC card */ static int mmc_omap_start_dma_transfer(struct mmc_omap_host *host, struct mmc_request *req) { - int sync_dev, sync_dir = 0; - int dma_ch = 0, ret = 0, err = 1; + int dma_ch = 0, ret = 0, err = 1, i; struct mmc_data *data = req->data; + /* Sanity check: all the SG entries must be aligned by block size. */ + for (i = 0; i < host->dma_len; i++) { + struct scatterlist *sgl; + + sgl = data->sg + i; + if (sgl->length % data->blksz) + return -EINVAL; + } + if ((data->blksz % 4) != 0) + /* REVISIT: The MMC buffer increments only when MSB is written. + * Return error for blksz which is non multiple of four. + */ + return -EINVAL; + /* * If for some reason the DMA transfer is still active, * we wait for timeout period and free the dma @@ -665,49 +713,22 @@ mmc_omap_start_dma_transfer(struct mmc_omap_host *host, struct mmc_request *req) return err; } - if (!(data->flags & MMC_DATA_WRITE)) { - host->dma_dir = DMA_FROM_DEVICE; - if (host->id == OMAP_MMC1_DEVID) - sync_dev = OMAP24XX_DMA_MMC1_RX; - else - sync_dev = OMAP24XX_DMA_MMC2_RX; - } else { - host->dma_dir = DMA_TO_DEVICE; - if (host->id == OMAP_MMC1_DEVID) - sync_dev = OMAP24XX_DMA_MMC1_TX; - else - sync_dev = OMAP24XX_DMA_MMC2_TX; - } - - ret = omap_request_dma(sync_dev, "MMC/SD", mmc_omap_dma_cb, - host, &dma_ch); + ret = omap_request_dma(mmc_omap_get_dma_sync_dev(host, data), "MMC/SD", + mmc_omap_dma_cb,host, &dma_ch); if (ret != 0) { - dev_dbg(mmc_dev(host->mmc), + dev_err(mmc_dev(host->mmc), "%s: omap_request_dma() failed with %d\n", mmc_hostname(host->mmc), ret); return ret; } host->dma_len = dma_map_sg(mmc_dev(host->mmc), data->sg, - data->sg_len, host->dma_dir); + data->sg_len, mmc_omap_get_dma_dir(host, data)); host->dma_ch = dma_ch; + host->dma_sg_idx = 0; - if (!(data->flags & MMC_DATA_WRITE)) - mmc_omap_config_dma_param(1, host, data); - else - mmc_omap_config_dma_param(0, host, data); - - if ((data->blksz % 4) == 0) - omap_set_dma_transfer_params(dma_ch, OMAP_DMA_DATA_TYPE_S32, - (data->blksz / 4), data->blocks, OMAP_DMA_SYNC_FRAME, - sync_dev, sync_dir); - else - /* REVISIT: The MMC buffer increments only when MSB is written. - * Return error for blksz which is non multiple of four. - */ - return -EINVAL; + mmc_omap_config_dma_params(host, data, data->sg); - omap_start_dma(dma_ch); return 0; } @@ -757,7 +778,6 @@ mmc_omap_prepare_data(struct mmc_omap_host *host, struct mmc_request *req) host->data = req->data; if (req->data == NULL) { - host->datadir = OMAP_MMC_DATADIR_NONE; OMAP_HSMMC_WRITE(host->base, BLK, 0); return 0; } @@ -766,9 +786,6 @@ mmc_omap_prepare_data(struct mmc_omap_host *host, struct mmc_request *req) | (req->data->blocks << 16)); set_data_timeout(host, req); - host->datadir = (req->data->flags & MMC_DATA_WRITE) ? - OMAP_MMC_DATADIR_WRITE : OMAP_MMC_DATADIR_READ; - if (host->use_dma) { ret = mmc_omap_start_dma_transfer(host, req); if (ret != 0) { @@ -1027,10 +1044,11 @@ static int __init omap_mmc_probe(struct platform_device *pdev) else host->dbclk_enabled = 1; -#ifdef CONFIG_MMC_BLOCK_BOUNCE - mmc->max_phys_segs = 1; - mmc->max_hw_segs = 1; -#endif + /* Since we do only SG emulation, we can have as many segs + * as we want. */ + mmc->max_phys_segs = 1024; + mmc->max_hw_segs = 1024; + mmc->max_blk_size = 512; /* Block Length at max can be 1024 */ mmc->max_blk_count = 0xFFFF; /* No. of Blocks is 16 bits */ mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count; -- cgit v1.2.3-59-g8ed1b From 731530104afa6310660455ad86353dbe9e226740 Mon Sep 17 00:00:00 2001 From: Jarkko Lavinen Date: Fri, 21 Nov 2008 16:49:54 +0200 Subject: omap_hsmmc: Add 8-bit bus width mode support Signed-off-by: Jarkko Lavinen Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 483e591f7bbb..fd1657ad2a74 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -77,6 +77,7 @@ #define MSBS (1 << 5) #define BCE (1 << 1) #define FOUR_BIT (1 << 1) +#define DW8 (1 << 5) #define CC 0x1 #define TC 0x02 #define OD 0x1 @@ -817,6 +818,7 @@ static void omap_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) u16 dsor = 0; unsigned long regval; unsigned long timeout; + u32 con; switch (ios->power_mode) { case MMC_POWER_OFF: @@ -827,12 +829,18 @@ static void omap_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) break; } + con = OMAP_HSMMC_READ(host->base, CON); switch (mmc->ios.bus_width) { + case MMC_BUS_WIDTH_8: + OMAP_HSMMC_WRITE(host->base, CON, con | DW8); + break; case MMC_BUS_WIDTH_4: + OMAP_HSMMC_WRITE(host->base, CON, con & ~DW8); OMAP_HSMMC_WRITE(host->base, HCTL, OMAP_HSMMC_READ(host->base, HCTL) | FOUR_BIT); break; case MMC_BUS_WIDTH_1: + OMAP_HSMMC_WRITE(host->base, CON, con & ~DW8); OMAP_HSMMC_WRITE(host->base, HCTL, OMAP_HSMMC_READ(host->base, HCTL) & ~FOUR_BIT); break; @@ -1057,7 +1065,9 @@ static int __init omap_mmc_probe(struct platform_device *pdev) mmc->ocr_avail = mmc_slot(host).ocr_mask; mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED; - if (pdata->slots[host->slot_id].wires >= 4) + if (pdata->slots[host->slot_id].wires >= 8) + mmc->caps |= MMC_CAP_8_BIT_DATA; + else if (pdata->slots[host->slot_id].wires >= 4) mmc->caps |= MMC_CAP_4_BIT_DATA; omap_hsmmc_init(host); -- cgit v1.2.3-59-g8ed1b From e1a55f5eeae90de3f1113dea8cd40d54e1562abf Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 26 Jan 2009 13:17:25 +0200 Subject: omap_hsmmc: Allow cover switch to cause rescan Allow a cover switch to be used to cause a rescan of the MMC slot. Signed-off-by: Adrian Hunter Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index fd1657ad2a74..f3e4e0fd6655 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -571,7 +571,10 @@ static void mmc_omap_detect(struct work_struct *work) mmc_carddetect_work); struct omap_mmc_slot_data *slot = &mmc_slot(host); - host->carddetect = slot->card_detect(slot->card_detect_irq); + if (mmc_slot(host).card_detect) + host->carddetect = slot->card_detect(slot->card_detect_irq); + else + host->carddetect = -ENOSYS; sysfs_notify(&host->mmc->class_dev.kobj, NULL, "cover_switch"); if (host->carddetect) { @@ -1089,7 +1092,7 @@ static int __init omap_mmc_probe(struct platform_device *pdev) } /* Request IRQ for card detect */ - if ((mmc_slot(host).card_detect_irq) && (mmc_slot(host).card_detect)) { + if ((mmc_slot(host).card_detect_irq)) { ret = request_irq(mmc_slot(host).card_detect_irq, omap_mmc_cd_handler, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING @@ -1112,8 +1115,8 @@ static int __init omap_mmc_probe(struct platform_device *pdev) if (ret < 0) goto err_slot_name; } - if (mmc_slot(host).card_detect_irq && mmc_slot(host).card_detect && - host->pdata->slots[host->slot_id].get_cover_state) { + if (mmc_slot(host).card_detect_irq && + host->pdata->slots[host->slot_id].get_cover_state) { ret = device_create_file(&mmc->class_dev, &dev_attr_cover_switch); if (ret < 0) -- cgit v1.2.3-59-g8ed1b From e68fdabc0d27f25428d53d43caae8bc79ed63d32 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 30 Jan 2009 10:59:31 +0200 Subject: omap_hsmmc: Do not prefix slot name Allow slot_name to be the same as the other OMAP driver, by removing the redundant "slot:" prefix. Signed-off-by: Adrian Hunter Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index f3e4e0fd6655..7a9ca4f22405 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -216,7 +216,7 @@ mmc_omap_show_slot_name(struct device *dev, struct device_attribute *attr, struct mmc_omap_host *host = mmc_priv(mmc); struct omap_mmc_slot_data slot = host->pdata->slots[host->slot_id]; - return sprintf(buf, "slot:%s\n", slot.name); + return sprintf(buf, "%s\n", slot.name); } static DEVICE_ATTR(slot_name, S_IRUGO, mmc_omap_show_slot_name, NULL); -- cgit v1.2.3-59-g8ed1b From 0683af4887bf61b5285c4e08cad5c7d110fbd605 Mon Sep 17 00:00:00 2001 From: Jarkko Lavinen Date: Thu, 12 Mar 2009 15:30:58 +0200 Subject: omap_hsmmc: Disable SDBP at suspend Turn off the bus power at suspend. Signed-off-by: Jarkko Lavinen Signed-off-by: Adrian Hunter Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 7a9ca4f22405..283265154944 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -1214,20 +1214,8 @@ static int omap_mmc_suspend(struct platform_device *pdev, pm_message_t state) " level suspend\n"); } - if (host->id == OMAP_MMC1_DEVID - && !(OMAP_HSMMC_READ(host->base, HCTL) - & SDVSDET)) { - OMAP_HSMMC_WRITE(host->base, HCTL, - OMAP_HSMMC_READ(host->base, HCTL) - & SDVSCLR); - OMAP_HSMMC_WRITE(host->base, HCTL, - OMAP_HSMMC_READ(host->base, HCTL) - | SDVS30); - OMAP_HSMMC_WRITE(host->base, HCTL, - OMAP_HSMMC_READ(host->base, HCTL) - | SDBP); - } - + OMAP_HSMMC_WRITE(host->base, HCTL, + OMAP_HSMMC_READ(host->base, HCTL) & ~SDBP); clk_disable(host->fclk); clk_disable(host->iclk); clk_disable(host->dbclk); -- cgit v1.2.3-59-g8ed1b From f3e2f1dd3bbe9352654eec8223495d35e1f52af2 Mon Sep 17 00:00:00 2001 From: Grazvydas Ignotas Date: Sat, 3 Jan 2009 10:36:13 +0000 Subject: omap_hsmmc: Fix MMC3 dma Data transfers on third OMAP3 MMC controller don't work because DMA line numbers are only defined for MMC1 and MMC2. Fix that and store line numbers in mmc_omap_host structure to reduce code size. Tested on OMAP3 pandora board. Signed-off-by: Grazvydas Ignotas Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 283265154944..61883093e25b 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -100,6 +100,7 @@ */ #define OMAP_MMC1_DEVID 0 #define OMAP_MMC2_DEVID 1 +#define OMAP_MMC3_DEVID 2 #define MMC_TIMEOUT_MS 20 #define OMAP_MMC_MASTER_CLOCK 96000000 @@ -144,6 +145,7 @@ struct mmc_omap_host { int irq; int carddetect; int use_dma, dma_ch; + int dma_line_tx, dma_line_rx; int slot_id; int dbclk_enabled; int response_busy; @@ -602,17 +604,10 @@ static int mmc_omap_get_dma_sync_dev(struct mmc_omap_host *host, { int sync_dev; - if (data->flags & MMC_DATA_WRITE) { - if (host->id == OMAP_MMC1_DEVID) - sync_dev = OMAP24XX_DMA_MMC1_TX; - else - sync_dev = OMAP24XX_DMA_MMC2_TX; - } else { - if (host->id == OMAP_MMC1_DEVID) - sync_dev = OMAP24XX_DMA_MMC1_RX; - else - sync_dev = OMAP24XX_DMA_MMC2_RX; - } + if (data->flags & MMC_DATA_WRITE) + sync_dev = host->dma_line_tx; + else + sync_dev = host->dma_line_rx; return sync_dev; } @@ -1075,6 +1070,25 @@ static int __init omap_mmc_probe(struct platform_device *pdev) omap_hsmmc_init(host); + /* Select DMA lines */ + switch (host->id) { + case OMAP_MMC1_DEVID: + host->dma_line_tx = OMAP24XX_DMA_MMC1_TX; + host->dma_line_rx = OMAP24XX_DMA_MMC1_RX; + break; + case OMAP_MMC2_DEVID: + host->dma_line_tx = OMAP24XX_DMA_MMC2_TX; + host->dma_line_rx = OMAP24XX_DMA_MMC2_RX; + break; + case OMAP_MMC3_DEVID: + host->dma_line_tx = OMAP34XX_DMA_MMC3_TX; + host->dma_line_rx = OMAP34XX_DMA_MMC3_RX; + break; + default: + dev_err(mmc_dev(host->mmc), "Invalid MMC id\n"); + goto err_irq; + } + /* Request IRQ for MMC operations */ ret = request_irq(host->irq, mmc_omap_irq, IRQF_DISABLED, mmc_hostname(mmc), host); -- cgit v1.2.3-59-g8ed1b From e13bb3003a33df8f82cd027f8abfa5cd73f2eec0 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 12 Mar 2009 17:08:26 +0200 Subject: omap_hsmmc: Wait for SDBP It is necessary to wait for bus power before sending any commands. Signed-off-by: Adrian Hunter Signed-off-by: Pierre Ossman --- drivers/mmc/host/omap_hsmmc.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 61883093e25b..488f222054f8 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -498,6 +498,19 @@ static irqreturn_t mmc_omap_irq(int irq, void *dev_id) return IRQ_HANDLED; } +static void set_sd_bus_power(struct mmc_omap_host *host) +{ + unsigned long i; + + OMAP_HSMMC_WRITE(host->base, HCTL, + OMAP_HSMMC_READ(host->base, HCTL) | SDBP); + for (i = 0; i < loops_per_jiffy; i++) { + if (OMAP_HSMMC_READ(host->base, HCTL) & SDBP) + break; + cpu_relax(); + } +} + /* * Switch MMC interface voltage ... only relevant for MMC1. * @@ -554,9 +567,7 @@ static int omap_mmc_switch_opcond(struct mmc_omap_host *host, int vdd) reg_val |= SDVS30; OMAP_HSMMC_WRITE(host->base, HCTL, reg_val); - - OMAP_HSMMC_WRITE(host->base, HCTL, - OMAP_HSMMC_READ(host->base, HCTL) | SDBP); + set_sd_bus_power(host); return 0; err: @@ -942,8 +953,7 @@ static void omap_hsmmc_init(struct mmc_omap_host *host) OMAP_HSMMC_WRITE(host->base, SYSCONFIG, value | AUTOIDLE); /* Set SD bus power bit */ - value = OMAP_HSMMC_READ(host->base, HCTL); - OMAP_HSMMC_WRITE(host->base, HCTL, value | SDBP); + set_sd_bus_power(host); } static struct mmc_host_ops mmc_omap_ops = { -- cgit v1.2.3-59-g8ed1b From f079a8fc61e3dc35830f6abc58c21ae815ab4297 Mon Sep 17 00:00:00 2001 From: Wolfgang Muees Date: Mon, 16 Mar 2009 12:23:03 +0100 Subject: mmc_spi: adjust for delayed data token response Some cards are not able to send the data token in time, but miss the time frame for some bits(!). So synchronize to the start of the token. Signed-off-by: Wolfgang Muees Acked-by: David Brownell Signed-off-by: Pierre Ossman --- drivers/mmc/host/mmc_spi.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 69e1442ff2e2..72f8bde4877a 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -612,6 +612,7 @@ mmc_spi_writeblock(struct mmc_spi_host *host, struct spi_transfer *t, struct spi_device *spi = host->spi; int status, i; struct scratch *scratch = host->data; + u32 pattern; if (host->mmc->use_spi_crc) scratch->crc_val = cpu_to_be16( @@ -639,8 +640,27 @@ mmc_spi_writeblock(struct mmc_spi_host *host, struct spi_transfer *t, * doesn't necessarily tell whether the write operation succeeded; * it just says if the transmission was ok and whether *earlier* * writes succeeded; see the standard. + * + * In practice, there are (even modern SDHC-)cards which are late + * in sending the response, and miss the time frame by a few bits, + * so we have to cope with this situation and check the response + * bit-by-bit. Arggh!!! */ - switch (SPI_MMC_RESPONSE_CODE(scratch->status[0])) { + pattern = scratch->status[0] << 24; + pattern |= scratch->status[1] << 16; + pattern |= scratch->status[2] << 8; + pattern |= scratch->status[3]; + + /* First 3 bit of pattern are undefined */ + pattern |= 0xE0000000; + + /* left-adjust to leading 0 bit */ + while (pattern & 0x80000000) + pattern <<= 1; + /* right-adjust for pattern matching. Code is in bit 4..0 now. */ + pattern >>= 27; + + switch (pattern) { case SPI_RESPONSE_ACCEPTED: status = 0; break; @@ -671,8 +691,9 @@ mmc_spi_writeblock(struct mmc_spi_host *host, struct spi_transfer *t, /* Return when not busy. If we didn't collect that status yet, * we'll need some more I/O. */ - for (i = 1; i < sizeof(scratch->status); i++) { - if (scratch->status[i] != 0) + for (i = 4; i < sizeof(scratch->status); i++) { + /* card is non-busy if the most recent bit is 1 */ + if (scratch->status[i] & 0x01) return 0; } return mmc_spi_wait_unbusy(host, timeout); -- cgit v1.2.3-59-g8ed1b From 4e4141a526dd7f5ac3ce1458ae79ea6e5a515b06 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Mar 2009 00:13:46 +0300 Subject: sdhci: Add support for bus-specific IO memory accessors Currently the SDHCI driver works with PCI accessors (write{l,b,w} and read{l,b,w}). With this patch drivers may change memory accessors, so that we can support hosts with "weird" IO memory access requirments. For example, in "FSL eSDHC" SDHCI hardware all registers are 32 bit width, with big-endian addressing. That is, readb(0x2f) should turn into readb(0x2c), and readw(0x2c) should be translated to le16_to_cpu(readw(0x2e)). Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/Kconfig | 7 +++ drivers/mmc/host/sdhci.c | 159 +++++++++++++++++++++++------------------------ drivers/mmc/host/sdhci.h | 95 ++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 81 deletions(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 092baf66733c..e8a7b50a01e7 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -37,6 +37,13 @@ config MMC_SDHCI If unsure, say N. +config MMC_SDHCI_IO_ACCESSORS + bool + depends on MMC_SDHCI + help + This is silent Kconfig symbol that is selected by the drivers that + need to overwrite SDHCI IO memory accessors. + config MMC_SDHCI_PCI tristate "SDHCI support on PCI bus" depends on MMC_SDHCI && PCI diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index accb592764ed..fd36b822f809 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -48,35 +48,35 @@ static void sdhci_dumpregs(struct sdhci_host *host) printk(KERN_DEBUG DRIVER_NAME ": ============== REGISTER DUMP ==============\n"); printk(KERN_DEBUG DRIVER_NAME ": Sys addr: 0x%08x | Version: 0x%08x\n", - readl(host->ioaddr + SDHCI_DMA_ADDRESS), - readw(host->ioaddr + SDHCI_HOST_VERSION)); + sdhci_readl(host, SDHCI_DMA_ADDRESS), + sdhci_readw(host, SDHCI_HOST_VERSION)); printk(KERN_DEBUG DRIVER_NAME ": Blk size: 0x%08x | Blk cnt: 0x%08x\n", - readw(host->ioaddr + SDHCI_BLOCK_SIZE), - readw(host->ioaddr + SDHCI_BLOCK_COUNT)); + sdhci_readw(host, SDHCI_BLOCK_SIZE), + sdhci_readw(host, SDHCI_BLOCK_COUNT)); printk(KERN_DEBUG DRIVER_NAME ": Argument: 0x%08x | Trn mode: 0x%08x\n", - readl(host->ioaddr + SDHCI_ARGUMENT), - readw(host->ioaddr + SDHCI_TRANSFER_MODE)); + sdhci_readl(host, SDHCI_ARGUMENT), + sdhci_readw(host, SDHCI_TRANSFER_MODE)); printk(KERN_DEBUG DRIVER_NAME ": Present: 0x%08x | Host ctl: 0x%08x\n", - readl(host->ioaddr + SDHCI_PRESENT_STATE), - readb(host->ioaddr + SDHCI_HOST_CONTROL)); + sdhci_readl(host, SDHCI_PRESENT_STATE), + sdhci_readb(host, SDHCI_HOST_CONTROL)); printk(KERN_DEBUG DRIVER_NAME ": Power: 0x%08x | Blk gap: 0x%08x\n", - readb(host->ioaddr + SDHCI_POWER_CONTROL), - readb(host->ioaddr + SDHCI_BLOCK_GAP_CONTROL)); + sdhci_readb(host, SDHCI_POWER_CONTROL), + sdhci_readb(host, SDHCI_BLOCK_GAP_CONTROL)); printk(KERN_DEBUG DRIVER_NAME ": Wake-up: 0x%08x | Clock: 0x%08x\n", - readb(host->ioaddr + SDHCI_WAKE_UP_CONTROL), - readw(host->ioaddr + SDHCI_CLOCK_CONTROL)); + sdhci_readb(host, SDHCI_WAKE_UP_CONTROL), + sdhci_readw(host, SDHCI_CLOCK_CONTROL)); printk(KERN_DEBUG DRIVER_NAME ": Timeout: 0x%08x | Int stat: 0x%08x\n", - readb(host->ioaddr + SDHCI_TIMEOUT_CONTROL), - readl(host->ioaddr + SDHCI_INT_STATUS)); + sdhci_readb(host, SDHCI_TIMEOUT_CONTROL), + sdhci_readl(host, SDHCI_INT_STATUS)); printk(KERN_DEBUG DRIVER_NAME ": Int enab: 0x%08x | Sig enab: 0x%08x\n", - readl(host->ioaddr + SDHCI_INT_ENABLE), - readl(host->ioaddr + SDHCI_SIGNAL_ENABLE)); + sdhci_readl(host, SDHCI_INT_ENABLE), + sdhci_readl(host, SDHCI_SIGNAL_ENABLE)); printk(KERN_DEBUG DRIVER_NAME ": AC12 err: 0x%08x | Slot int: 0x%08x\n", - readw(host->ioaddr + SDHCI_ACMD12_ERR), - readw(host->ioaddr + SDHCI_SLOT_INT_STATUS)); + sdhci_readw(host, SDHCI_ACMD12_ERR), + sdhci_readw(host, SDHCI_SLOT_INT_STATUS)); printk(KERN_DEBUG DRIVER_NAME ": Caps: 0x%08x | Max curr: 0x%08x\n", - readl(host->ioaddr + SDHCI_CAPABILITIES), - readl(host->ioaddr + SDHCI_MAX_CURRENT)); + sdhci_readl(host, SDHCI_CAPABILITIES), + sdhci_readl(host, SDHCI_MAX_CURRENT)); printk(KERN_DEBUG DRIVER_NAME ": ===========================================\n"); } @@ -92,12 +92,12 @@ static void sdhci_reset(struct sdhci_host *host, u8 mask) unsigned long timeout; if (host->quirks & SDHCI_QUIRK_NO_CARD_NO_RESET) { - if (!(readl(host->ioaddr + SDHCI_PRESENT_STATE) & + if (!(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT)) return; } - writeb(mask, host->ioaddr + SDHCI_SOFTWARE_RESET); + sdhci_writeb(host, mask, SDHCI_SOFTWARE_RESET); if (mask & SDHCI_RESET_ALL) host->clock = 0; @@ -106,7 +106,7 @@ static void sdhci_reset(struct sdhci_host *host, u8 mask) timeout = 100; /* hw clears the bit when it's done */ - while (readb(host->ioaddr + SDHCI_SOFTWARE_RESET) & mask) { + while (sdhci_readb(host, SDHCI_SOFTWARE_RESET) & mask) { if (timeout == 0) { printk(KERN_ERR "%s: Reset 0x%x never completed.\n", mmc_hostname(host->mmc), (int)mask); @@ -132,26 +132,26 @@ static void sdhci_init(struct sdhci_host *host) SDHCI_INT_DMA_END | SDHCI_INT_DATA_END | SDHCI_INT_RESPONSE | SDHCI_INT_ADMA_ERROR; - writel(intmask, host->ioaddr + SDHCI_INT_ENABLE); - writel(intmask, host->ioaddr + SDHCI_SIGNAL_ENABLE); + sdhci_writel(host, intmask, SDHCI_INT_ENABLE); + sdhci_writel(host, intmask, SDHCI_SIGNAL_ENABLE); } static void sdhci_activate_led(struct sdhci_host *host) { u8 ctrl; - ctrl = readb(host->ioaddr + SDHCI_HOST_CONTROL); + ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL); ctrl |= SDHCI_CTRL_LED; - writeb(ctrl, host->ioaddr + SDHCI_HOST_CONTROL); + sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); } static void sdhci_deactivate_led(struct sdhci_host *host) { u8 ctrl; - ctrl = readb(host->ioaddr + SDHCI_HOST_CONTROL); + ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL); ctrl &= ~SDHCI_CTRL_LED; - writeb(ctrl, host->ioaddr + SDHCI_HOST_CONTROL); + sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); } #ifdef SDHCI_USE_LEDS_CLASS @@ -205,7 +205,7 @@ static void sdhci_read_block_pio(struct sdhci_host *host) while (len) { if (chunk == 0) { - scratch = readl(host->ioaddr + SDHCI_BUFFER); + scratch = sdhci_readl(host, SDHCI_BUFFER); chunk = 4; } @@ -257,7 +257,7 @@ static void sdhci_write_block_pio(struct sdhci_host *host) len--; if ((chunk == 4) || ((len == 0) && (blksize == 0))) { - writel(scratch, host->ioaddr + SDHCI_BUFFER); + sdhci_writel(host, scratch, SDHCI_BUFFER); chunk = 0; scratch = 0; } @@ -292,7 +292,7 @@ static void sdhci_transfer_pio(struct sdhci_host *host) (host->data->blocks == 1)) mask = ~0; - while (readl(host->ioaddr + SDHCI_PRESENT_STATE) & mask) { + while (sdhci_readl(host, SDHCI_PRESENT_STATE) & mask) { if (host->data->flags & MMC_DATA_READ) sdhci_read_block_pio(host); else @@ -581,7 +581,7 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_data *data) host->data_early = 0; count = sdhci_calc_timeout(host, data); - writeb(count, host->ioaddr + SDHCI_TIMEOUT_CONTROL); + sdhci_writeb(host, count, SDHCI_TIMEOUT_CONTROL); if (host->flags & SDHCI_USE_DMA) host->flags |= SDHCI_REQ_USE_DMA; @@ -661,8 +661,8 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_data *data) WARN_ON(1); host->flags &= ~SDHCI_REQ_USE_DMA; } else { - writel(host->adma_addr, - host->ioaddr + SDHCI_ADMA_ADDRESS); + sdhci_writel(host, host->adma_addr, + SDHCI_ADMA_ADDRESS); } } else { int sg_cnt; @@ -681,8 +681,8 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_data *data) host->flags &= ~SDHCI_REQ_USE_DMA; } else { WARN_ON(sg_cnt != 1); - writel(sg_dma_address(data->sg), - host->ioaddr + SDHCI_DMA_ADDRESS); + sdhci_writel(host, sg_dma_address(data->sg), + SDHCI_DMA_ADDRESS); } } } @@ -693,14 +693,14 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_data *data) * is ADMA. */ if (host->version >= SDHCI_SPEC_200) { - ctrl = readb(host->ioaddr + SDHCI_HOST_CONTROL); + ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL); ctrl &= ~SDHCI_CTRL_DMA_MASK; if ((host->flags & SDHCI_REQ_USE_DMA) && (host->flags & SDHCI_USE_ADMA)) ctrl |= SDHCI_CTRL_ADMA32; else ctrl |= SDHCI_CTRL_SDMA; - writeb(ctrl, host->ioaddr + SDHCI_HOST_CONTROL); + sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); } if (!(host->flags & SDHCI_REQ_USE_DMA)) { @@ -710,9 +710,8 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_data *data) } /* We do not handle DMA boundaries, so set it to max (512 KiB) */ - writew(SDHCI_MAKE_BLKSZ(7, data->blksz), - host->ioaddr + SDHCI_BLOCK_SIZE); - writew(data->blocks, host->ioaddr + SDHCI_BLOCK_COUNT); + sdhci_writew(host, SDHCI_MAKE_BLKSZ(7, data->blksz), SDHCI_BLOCK_SIZE); + sdhci_writew(host, data->blocks, SDHCI_BLOCK_COUNT); } static void sdhci_set_transfer_mode(struct sdhci_host *host, @@ -733,7 +732,7 @@ static void sdhci_set_transfer_mode(struct sdhci_host *host, if (host->flags & SDHCI_REQ_USE_DMA) mode |= SDHCI_TRNS_DMA; - writew(mode, host->ioaddr + SDHCI_TRANSFER_MODE); + sdhci_writew(host, mode, SDHCI_TRANSFER_MODE); } static void sdhci_finish_data(struct sdhci_host *host) @@ -802,7 +801,7 @@ static void sdhci_send_command(struct sdhci_host *host, struct mmc_command *cmd) if (host->mrq->data && (cmd == host->mrq->data->stop)) mask &= ~SDHCI_DATA_INHIBIT; - while (readl(host->ioaddr + SDHCI_PRESENT_STATE) & mask) { + while (sdhci_readl(host, SDHCI_PRESENT_STATE) & mask) { if (timeout == 0) { printk(KERN_ERR "%s: Controller never released " "inhibit bit(s).\n", mmc_hostname(host->mmc)); @@ -821,7 +820,7 @@ static void sdhci_send_command(struct sdhci_host *host, struct mmc_command *cmd) sdhci_prepare_data(host, cmd->data); - writel(cmd->arg, host->ioaddr + SDHCI_ARGUMENT); + sdhci_writel(host, cmd->arg, SDHCI_ARGUMENT); sdhci_set_transfer_mode(host, cmd->data); @@ -849,8 +848,7 @@ static void sdhci_send_command(struct sdhci_host *host, struct mmc_command *cmd) if (cmd->data) flags |= SDHCI_CMD_DATA; - writew(SDHCI_MAKE_CMD(cmd->opcode, flags), - host->ioaddr + SDHCI_COMMAND); + sdhci_writew(host, SDHCI_MAKE_CMD(cmd->opcode, flags), SDHCI_COMMAND); } static void sdhci_finish_command(struct sdhci_host *host) @@ -863,15 +861,15 @@ static void sdhci_finish_command(struct sdhci_host *host) if (host->cmd->flags & MMC_RSP_136) { /* CRC is stripped so we need to do some shifting. */ for (i = 0;i < 4;i++) { - host->cmd->resp[i] = readl(host->ioaddr + + host->cmd->resp[i] = sdhci_readl(host, SDHCI_RESPONSE + (3-i)*4) << 8; if (i != 3) host->cmd->resp[i] |= - readb(host->ioaddr + + sdhci_readb(host, SDHCI_RESPONSE + (3-i)*4-1); } } else { - host->cmd->resp[0] = readl(host->ioaddr + SDHCI_RESPONSE); + host->cmd->resp[0] = sdhci_readl(host, SDHCI_RESPONSE); } } @@ -895,7 +893,7 @@ static void sdhci_set_clock(struct sdhci_host *host, unsigned int clock) if (clock == host->clock) return; - writew(0, host->ioaddr + SDHCI_CLOCK_CONTROL); + sdhci_writew(host, 0, SDHCI_CLOCK_CONTROL); if (clock == 0) goto out; @@ -908,11 +906,11 @@ static void sdhci_set_clock(struct sdhci_host *host, unsigned int clock) clk = div << SDHCI_DIVIDER_SHIFT; clk |= SDHCI_CLOCK_INT_EN; - writew(clk, host->ioaddr + SDHCI_CLOCK_CONTROL); + sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); /* Wait max 10 ms */ timeout = 10; - while (!((clk = readw(host->ioaddr + SDHCI_CLOCK_CONTROL)) + while (!((clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL)) & SDHCI_CLOCK_INT_STABLE)) { if (timeout == 0) { printk(KERN_ERR "%s: Internal clock never " @@ -925,7 +923,7 @@ static void sdhci_set_clock(struct sdhci_host *host, unsigned int clock) } clk |= SDHCI_CLOCK_CARD_EN; - writew(clk, host->ioaddr + SDHCI_CLOCK_CONTROL); + sdhci_writew(host, clk, SDHCI_CLOCK_CONTROL); out: host->clock = clock; @@ -939,7 +937,7 @@ static void sdhci_set_power(struct sdhci_host *host, unsigned short power) return; if (power == (unsigned short)-1) { - writeb(0, host->ioaddr + SDHCI_POWER_CONTROL); + sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); goto out; } @@ -948,7 +946,7 @@ static void sdhci_set_power(struct sdhci_host *host, unsigned short power) * a new value. Some controllers don't seem to like this though. */ if (!(host->quirks & SDHCI_QUIRK_SINGLE_POWER_WRITE)) - writeb(0, host->ioaddr + SDHCI_POWER_CONTROL); + sdhci_writeb(host, 0, SDHCI_POWER_CONTROL); pwr = SDHCI_POWER_ON; @@ -973,10 +971,9 @@ static void sdhci_set_power(struct sdhci_host *host, unsigned short power) * and set turn on power at the same time, so set the voltage first. */ if ((host->quirks & SDHCI_QUIRK_NO_SIMULT_VDD_AND_POWER)) - writeb(pwr & ~SDHCI_POWER_ON, - host->ioaddr + SDHCI_POWER_CONTROL); + sdhci_writeb(host, pwr & ~SDHCI_POWER_ON, SDHCI_POWER_CONTROL); - writeb(pwr, host->ioaddr + SDHCI_POWER_CONTROL); + sdhci_writeb(host, pwr, SDHCI_POWER_CONTROL); out: host->power = power; @@ -1005,7 +1002,7 @@ static void sdhci_request(struct mmc_host *mmc, struct mmc_request *mrq) host->mrq = mrq; - if (!(readl(host->ioaddr + SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT) + if (!(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT) || (host->flags & SDHCI_DEVICE_DEAD)) { host->mrq->cmd->error = -ENOMEDIUM; tasklet_schedule(&host->finish_tasklet); @@ -1034,7 +1031,7 @@ static void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) * Should clear out any weird states. */ if (ios->power_mode == MMC_POWER_OFF) { - writel(0, host->ioaddr + SDHCI_SIGNAL_ENABLE); + sdhci_writel(host, 0, SDHCI_SIGNAL_ENABLE); sdhci_init(host); } @@ -1045,7 +1042,7 @@ static void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) else sdhci_set_power(host, ios->vdd); - ctrl = readb(host->ioaddr + SDHCI_HOST_CONTROL); + ctrl = sdhci_readb(host, SDHCI_HOST_CONTROL); if (ios->bus_width == MMC_BUS_WIDTH_4) ctrl |= SDHCI_CTRL_4BITBUS; @@ -1057,7 +1054,7 @@ static void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) else ctrl &= ~SDHCI_CTRL_HISPD; - writeb(ctrl, host->ioaddr + SDHCI_HOST_CONTROL); + sdhci_writeb(host, ctrl, SDHCI_HOST_CONTROL); /* * Some (ENE) controllers go apeshit on some ios operation, @@ -1085,7 +1082,7 @@ static int sdhci_get_ro(struct mmc_host *mmc) if (host->flags & SDHCI_DEVICE_DEAD) present = 0; else - present = readl(host->ioaddr + SDHCI_PRESENT_STATE); + present = sdhci_readl(host, SDHCI_PRESENT_STATE); spin_unlock_irqrestore(&host->lock, flags); @@ -1105,14 +1102,14 @@ static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable) if (host->flags & SDHCI_DEVICE_DEAD) goto out; - ier = readl(host->ioaddr + SDHCI_INT_ENABLE); + ier = sdhci_readl(host, SDHCI_INT_ENABLE); ier &= ~SDHCI_INT_CARD_INT; if (enable) ier |= SDHCI_INT_CARD_INT; - writel(ier, host->ioaddr + SDHCI_INT_ENABLE); - writel(ier, host->ioaddr + SDHCI_SIGNAL_ENABLE); + sdhci_writel(host, ier, SDHCI_INT_ENABLE); + sdhci_writel(host, ier, SDHCI_SIGNAL_ENABLE); out: mmiowb(); @@ -1142,7 +1139,7 @@ static void sdhci_tasklet_card(unsigned long param) spin_lock_irqsave(&host->lock, flags); - if (!(readl(host->ioaddr + SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT)) { + if (!(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT)) { if (host->mrq) { printk(KERN_ERR "%s: Card removed during transfer!\n", mmc_hostname(host->mmc)); @@ -1346,8 +1343,8 @@ static void sdhci_data_irq(struct sdhci_host *host, u32 intmask) * we need to at least restart the transfer. */ if (intmask & SDHCI_INT_DMA_END) - writel(readl(host->ioaddr + SDHCI_DMA_ADDRESS), - host->ioaddr + SDHCI_DMA_ADDRESS); + sdhci_writel(host, sdhci_readl(host, SDHCI_DMA_ADDRESS), + SDHCI_DMA_ADDRESS); if (intmask & SDHCI_INT_DATA_END) { if (host->cmd) { @@ -1373,7 +1370,7 @@ static irqreturn_t sdhci_irq(int irq, void *dev_id) spin_lock(&host->lock); - intmask = readl(host->ioaddr + SDHCI_INT_STATUS); + intmask = sdhci_readl(host, SDHCI_INT_STATUS); if (!intmask || intmask == 0xffffffff) { result = IRQ_NONE; @@ -1384,22 +1381,22 @@ static irqreturn_t sdhci_irq(int irq, void *dev_id) mmc_hostname(host->mmc), intmask); if (intmask & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE)) { - writel(intmask & (SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE), - host->ioaddr + SDHCI_INT_STATUS); + sdhci_writel(host, intmask & (SDHCI_INT_CARD_INSERT | + SDHCI_INT_CARD_REMOVE), SDHCI_INT_STATUS); tasklet_schedule(&host->card_tasklet); } intmask &= ~(SDHCI_INT_CARD_INSERT | SDHCI_INT_CARD_REMOVE); if (intmask & SDHCI_INT_CMD_MASK) { - writel(intmask & SDHCI_INT_CMD_MASK, - host->ioaddr + SDHCI_INT_STATUS); + sdhci_writel(host, intmask & SDHCI_INT_CMD_MASK, + SDHCI_INT_STATUS); sdhci_cmd_irq(host, intmask & SDHCI_INT_CMD_MASK); } if (intmask & SDHCI_INT_DATA_MASK) { - writel(intmask & SDHCI_INT_DATA_MASK, - host->ioaddr + SDHCI_INT_STATUS); + sdhci_writel(host, intmask & SDHCI_INT_DATA_MASK, + SDHCI_INT_STATUS); sdhci_data_irq(host, intmask & SDHCI_INT_DATA_MASK); } @@ -1410,7 +1407,7 @@ static irqreturn_t sdhci_irq(int irq, void *dev_id) if (intmask & SDHCI_INT_BUS_POWER) { printk(KERN_ERR "%s: Card is consuming too much power!\n", mmc_hostname(host->mmc)); - writel(SDHCI_INT_BUS_POWER, host->ioaddr + SDHCI_INT_STATUS); + sdhci_writel(host, SDHCI_INT_BUS_POWER, SDHCI_INT_STATUS); } intmask &= ~SDHCI_INT_BUS_POWER; @@ -1425,7 +1422,7 @@ static irqreturn_t sdhci_irq(int irq, void *dev_id) mmc_hostname(host->mmc), intmask); sdhci_dumpregs(host); - writel(intmask, host->ioaddr + SDHCI_INT_STATUS); + sdhci_writel(host, intmask, SDHCI_INT_STATUS); } result = IRQ_HANDLED; @@ -1537,7 +1534,7 @@ int sdhci_add_host(struct sdhci_host *host) sdhci_reset(host, SDHCI_RESET_ALL); - host->version = readw(host->ioaddr + SDHCI_HOST_VERSION); + host->version = sdhci_readw(host, SDHCI_HOST_VERSION); host->version = (host->version & SDHCI_SPEC_VER_MASK) >> SDHCI_SPEC_VER_SHIFT; if (host->version > SDHCI_SPEC_200) { @@ -1546,7 +1543,7 @@ int sdhci_add_host(struct sdhci_host *host) host->version); } - caps = readl(host->ioaddr + SDHCI_CAPABILITIES); + caps = sdhci_readl(host, SDHCI_CAPABILITIES); if (host->quirks & SDHCI_QUIRK_FORCE_DMA) host->flags |= SDHCI_USE_DMA; diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 43c37c68d07a..d9733f841061 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -10,6 +10,9 @@ */ #include +#include +#include +#include /* * Controller registers @@ -267,9 +270,101 @@ struct sdhci_host { struct sdhci_ops { +#ifdef CONFIG_MMC_SDHCI_IO_ACCESSORS + u32 (*readl)(struct sdhci_host *host, int reg); + u16 (*readw)(struct sdhci_host *host, int reg); + u8 (*readb)(struct sdhci_host *host, int reg); + void (*writel)(struct sdhci_host *host, u32 val, int reg); + void (*writew)(struct sdhci_host *host, u16 val, int reg); + void (*writeb)(struct sdhci_host *host, u8 val, int reg); +#endif + int (*enable_dma)(struct sdhci_host *host); }; +#ifdef CONFIG_MMC_SDHCI_IO_ACCESSORS + +static inline void sdhci_writel(struct sdhci_host *host, u32 val, int reg) +{ + if (unlikely(host->ops->writel)) + host->ops->writel(host, val, reg); + else + writel(val, host->ioaddr + reg); +} + +static inline void sdhci_writew(struct sdhci_host *host, u16 val, int reg) +{ + if (unlikely(host->ops->writew)) + host->ops->writew(host, val, reg); + else + writew(val, host->ioaddr + reg); +} + +static inline void sdhci_writeb(struct sdhci_host *host, u8 val, int reg) +{ + if (unlikely(host->ops->writeb)) + host->ops->writeb(host, val, reg); + else + writeb(val, host->ioaddr + reg); +} + +static inline u32 sdhci_readl(struct sdhci_host *host, int reg) +{ + if (unlikely(host->ops->readl)) + return host->ops->readl(host, reg); + else + return readl(host->ioaddr + reg); +} + +static inline u16 sdhci_readw(struct sdhci_host *host, int reg) +{ + if (unlikely(host->ops->readw)) + return host->ops->readw(host, reg); + else + return readw(host->ioaddr + reg); +} + +static inline u8 sdhci_readb(struct sdhci_host *host, int reg) +{ + if (unlikely(host->ops->readb)) + return host->ops->readb(host, reg); + else + return readb(host->ioaddr + reg); +} + +#else + +static inline void sdhci_writel(struct sdhci_host *host, u32 val, int reg) +{ + writel(val, host->ioaddr + reg); +} + +static inline void sdhci_writew(struct sdhci_host *host, u16 val, int reg) +{ + writew(val, host->ioaddr + reg); +} + +static inline void sdhci_writeb(struct sdhci_host *host, u8 val, int reg) +{ + writeb(val, host->ioaddr + reg); +} + +static inline u32 sdhci_readl(struct sdhci_host *host, int reg) +{ + return readl(host->ioaddr + reg); +} + +static inline u16 sdhci_readw(struct sdhci_host *host, int reg) +{ + return readw(host->ioaddr + reg); +} + +static inline u8 sdhci_readb(struct sdhci_host *host, int reg) +{ + return readb(host->ioaddr + reg); +} + +#endif /* CONFIG_MMC_SDHCI_IO_ACCESSORS */ extern struct sdhci_host *sdhci_alloc_host(struct device *dev, size_t priv_size); -- cgit v1.2.3-59-g8ed1b From 7260cf5e12393536ce61d184c3fc750fb2ba635a Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Mar 2009 00:13:48 +0300 Subject: sdhci: Split card-detection IRQs management from sdhci_init() Card detection interrupts should be handled separately as they should not be enabled before mmc_add_host() returns and should be disabled before calling mmc_remove_host(). The same is for suspend and resume routines. sdhci_init() no longer enables card-detection irqs. Instead, two new functions implemented: sdhci_enable_card_detection() and sdhci_disable_card_detection(). New sdhci_reinit() call implemented to behave the same way as the old sdhci_init(). Also, this patch implements and uses few new helpers to manage IRQs in a more conveinient way, that is: - sdhci_clear_set_irqs() - sdhci_unmask_irqs() - sdhci_mask_irqs() - SDHCI_INT_ALL_MASK constant sdhci_enable_sdio_irq() converted to these new helpers, plus the helpers will be used by the subsequent patches. Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 78 +++++++++++++++++++++++++++++++++++++----------- drivers/mmc/host/sdhci.h | 1 + 2 files changed, 62 insertions(+), 17 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index fd36b822f809..bd5acfb0c1b8 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -87,6 +87,47 @@ static void sdhci_dumpregs(struct sdhci_host *host) * * \*****************************************************************************/ +static void sdhci_clear_set_irqs(struct sdhci_host *host, u32 clear, u32 set) +{ + u32 ier; + + ier = sdhci_readl(host, SDHCI_INT_ENABLE); + ier &= ~clear; + ier |= set; + sdhci_writel(host, ier, SDHCI_INT_ENABLE); + sdhci_writel(host, ier, SDHCI_SIGNAL_ENABLE); +} + +static void sdhci_unmask_irqs(struct sdhci_host *host, u32 irqs) +{ + sdhci_clear_set_irqs(host, 0, irqs); +} + +static void sdhci_mask_irqs(struct sdhci_host *host, u32 irqs) +{ + sdhci_clear_set_irqs(host, irqs, 0); +} + +static void sdhci_set_card_detection(struct sdhci_host *host, bool enable) +{ + u32 irqs = SDHCI_INT_CARD_REMOVE | SDHCI_INT_CARD_INSERT; + + if (enable) + sdhci_unmask_irqs(host, irqs); + else + sdhci_mask_irqs(host, irqs); +} + +static void sdhci_enable_card_detection(struct sdhci_host *host) +{ + sdhci_set_card_detection(host, true); +} + +static void sdhci_disable_card_detection(struct sdhci_host *host) +{ + sdhci_set_card_detection(host, false); +} + static void sdhci_reset(struct sdhci_host *host, u8 mask) { unsigned long timeout; @@ -120,20 +161,21 @@ static void sdhci_reset(struct sdhci_host *host, u8 mask) static void sdhci_init(struct sdhci_host *host) { - u32 intmask; - sdhci_reset(host, SDHCI_RESET_ALL); - intmask = SDHCI_INT_BUS_POWER | SDHCI_INT_DATA_END_BIT | + sdhci_clear_set_irqs(host, SDHCI_INT_ALL_MASK, + SDHCI_INT_BUS_POWER | SDHCI_INT_DATA_END_BIT | SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_TIMEOUT | SDHCI_INT_INDEX | SDHCI_INT_END_BIT | SDHCI_INT_CRC | SDHCI_INT_TIMEOUT | - SDHCI_INT_CARD_REMOVE | SDHCI_INT_CARD_INSERT | SDHCI_INT_DATA_AVAIL | SDHCI_INT_SPACE_AVAIL | SDHCI_INT_DMA_END | SDHCI_INT_DATA_END | SDHCI_INT_RESPONSE | - SDHCI_INT_ADMA_ERROR; + SDHCI_INT_ADMA_ERROR); +} - sdhci_writel(host, intmask, SDHCI_INT_ENABLE); - sdhci_writel(host, intmask, SDHCI_SIGNAL_ENABLE); +static void sdhci_reinit(struct sdhci_host *host) +{ + sdhci_init(host); + sdhci_enable_card_detection(host); } static void sdhci_activate_led(struct sdhci_host *host) @@ -1032,7 +1074,7 @@ static void sdhci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) */ if (ios->power_mode == MMC_POWER_OFF) { sdhci_writel(host, 0, SDHCI_SIGNAL_ENABLE); - sdhci_init(host); + sdhci_reinit(host); } sdhci_set_clock(host, ios->clock); @@ -1093,7 +1135,6 @@ static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable) { struct sdhci_host *host; unsigned long flags; - u32 ier; host = mmc_priv(mmc); @@ -1102,15 +1143,10 @@ static void sdhci_enable_sdio_irq(struct mmc_host *mmc, int enable) if (host->flags & SDHCI_DEVICE_DEAD) goto out; - ier = sdhci_readl(host, SDHCI_INT_ENABLE); - - ier &= ~SDHCI_INT_CARD_INT; if (enable) - ier |= SDHCI_INT_CARD_INT; - - sdhci_writel(host, ier, SDHCI_INT_ENABLE); - sdhci_writel(host, ier, SDHCI_SIGNAL_ENABLE); - + sdhci_unmask_irqs(host, SDHCI_INT_CARD_INT); + else + sdhci_mask_irqs(host, SDHCI_INT_CARD_INT); out: mmiowb(); @@ -1452,6 +1488,8 @@ int sdhci_suspend_host(struct sdhci_host *host, pm_message_t state) { int ret; + sdhci_disable_card_detection(host); + ret = mmc_suspend_host(host->mmc, state); if (ret) return ret; @@ -1484,6 +1522,8 @@ int sdhci_resume_host(struct sdhci_host *host) if (ret) return ret; + sdhci_enable_card_detection(host); + return 0; } @@ -1743,6 +1783,8 @@ int sdhci_add_host(struct sdhci_host *host) (host->flags & SDHCI_USE_ADMA)?"A":"", (host->flags & SDHCI_USE_DMA)?"DMA":"PIO"); + sdhci_enable_card_detection(host); + return 0; #ifdef SDHCI_USE_LEDS_CLASS @@ -1779,6 +1821,8 @@ void sdhci_remove_host(struct sdhci_host *host, int dead) spin_unlock_irqrestore(&host->lock, flags); } + sdhci_disable_card_detection(host); + mmc_remove_host(host->mmc); #ifdef SDHCI_USE_LEDS_CLASS diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index d9733f841061..a9e25c6c0c0e 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -126,6 +126,7 @@ SDHCI_INT_DATA_AVAIL | SDHCI_INT_SPACE_AVAIL | \ SDHCI_INT_DATA_TIMEOUT | SDHCI_INT_DATA_CRC | \ SDHCI_INT_DATA_END_BIT) +#define SDHCI_INT_ALL_MASK ((unsigned int)-1) #define SDHCI_ACMD12_ERR 0x3C -- cgit v1.2.3-59-g8ed1b From 6aa943ab8994fe6e4ccba22c5bc8150a84268bdd Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Mar 2009 00:13:50 +0300 Subject: sdhci: Enable only relevant (DMA/PIO) interrupts during transfers Some hosts (that is, FSL eSDHC) throw PIO interrupts during DMA transfers, this causes tons of unneeded interrupts, and thus highly degraded speed. This patch modifies the driver so that now we only enable relevant (DMA or PIO) interrupts during transfers. Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index bd5acfb0c1b8..6fbbc005dd7f 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -167,9 +167,7 @@ static void sdhci_init(struct sdhci_host *host) SDHCI_INT_BUS_POWER | SDHCI_INT_DATA_END_BIT | SDHCI_INT_DATA_CRC | SDHCI_INT_DATA_TIMEOUT | SDHCI_INT_INDEX | SDHCI_INT_END_BIT | SDHCI_INT_CRC | SDHCI_INT_TIMEOUT | - SDHCI_INT_DATA_AVAIL | SDHCI_INT_SPACE_AVAIL | - SDHCI_INT_DMA_END | SDHCI_INT_DATA_END | SDHCI_INT_RESPONSE | - SDHCI_INT_ADMA_ERROR); + SDHCI_INT_DATA_END | SDHCI_INT_RESPONSE); } static void sdhci_reinit(struct sdhci_host *host) @@ -603,6 +601,17 @@ static u8 sdhci_calc_timeout(struct sdhci_host *host, struct mmc_data *data) return count; } +static void sdhci_set_transfer_irqs(struct sdhci_host *host) +{ + u32 pio_irqs = SDHCI_INT_DATA_AVAIL | SDHCI_INT_SPACE_AVAIL; + u32 dma_irqs = SDHCI_INT_DMA_END | SDHCI_INT_ADMA_ERROR; + + if (host->flags & SDHCI_REQ_USE_DMA) + sdhci_clear_set_irqs(host, pio_irqs, dma_irqs); + else + sdhci_clear_set_irqs(host, dma_irqs, pio_irqs); +} + static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_data *data) { u8 count; @@ -751,6 +760,8 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_data *data) host->blocks = data->blocks; } + sdhci_set_transfer_irqs(host); + /* We do not handle DMA boundaries, so set it to max (512 KiB) */ sdhci_writew(host, SDHCI_MAKE_BLKSZ(7, data->blksz), SDHCI_BLOCK_SIZE); sdhci_writew(host, data->blocks, SDHCI_BLOCK_COUNT); -- cgit v1.2.3-59-g8ed1b From 68d1fb7e229c6f95be4fbbe3eb46b24e41184924 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Mar 2009 00:13:52 +0300 Subject: sdhci: Add support for card-detection polling This patch adds SDHCI_QUIRK_BROKEN_CARD_DETECTION quirk. When specified, sdhci driver will set MMC_CAP_NEEDS_POLL MMC host capability, and won't enable card insert/remove interrupts. This is needed for hosts with unreliable card detection, such as FSL eSDHC. The original eSDHC driver was tring to "debounce" card-detection IRQs by reading present state and disabling particular interrupts. But with this debouncing scheme I noticed that sometimes we miss card insertion/removal events. Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 17 +++++++++++++++-- drivers/mmc/host/sdhci.h | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 6fbbc005dd7f..fc7cb489bdb7 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -112,6 +112,9 @@ static void sdhci_set_card_detection(struct sdhci_host *host, bool enable) { u32 irqs = SDHCI_INT_CARD_REMOVE | SDHCI_INT_CARD_INSERT; + if (host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) + return; + if (enable) sdhci_unmask_irqs(host, irqs); else @@ -1041,6 +1044,7 @@ out: static void sdhci_request(struct mmc_host *mmc, struct mmc_request *mrq) { struct sdhci_host *host; + bool present; unsigned long flags; host = mmc_priv(mmc); @@ -1055,8 +1059,14 @@ static void sdhci_request(struct mmc_host *mmc, struct mmc_request *mrq) host->mrq = mrq; - if (!(sdhci_readl(host, SDHCI_PRESENT_STATE) & SDHCI_CARD_PRESENT) - || (host->flags & SDHCI_DEVICE_DEAD)) { + /* If polling, assume that the card is always present. */ + if (host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) + present = true; + else + present = sdhci_readl(host, SDHCI_PRESENT_STATE) & + SDHCI_CARD_PRESENT; + + if (!present || host->flags & SDHCI_DEVICE_DEAD) { host->mrq->cmd->error = -ENOMEDIUM; tasklet_schedule(&host->finish_tasklet); } else @@ -1690,6 +1700,9 @@ int sdhci_add_host(struct sdhci_host *host) if (caps & SDHCI_CAN_DO_HISPD) mmc->caps |= MMC_CAP_SD_HIGHSPEED; + if (host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) + mmc->caps |= MMC_CAP_NEEDS_POLL; + mmc->ocr_avail = 0; if (caps & SDHCI_CAN_VDD_330) mmc->ocr_avail |= MMC_VDD_32_33|MMC_VDD_33_34; diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index a9e25c6c0c0e..968d713950f1 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -214,6 +214,8 @@ struct sdhci_host { #define SDHCI_QUIRK_BROKEN_SMALL_PIO (1<<13) /* Controller does not provide transfer-complete interrupt when not busy */ #define SDHCI_QUIRK_NO_BUSY_IRQ (1<<14) +/* Controller has unreliable card detection */ +#define SDHCI_QUIRK_BROKEN_CARD_DETECTION (1<<15) int irq; /* Device IRQ */ void __iomem * ioaddr; /* Mapped address */ -- cgit v1.2.3-59-g8ed1b From c5075a1089e808d8f471ce21b02810cc98ab2692 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Mar 2009 00:13:54 +0300 Subject: sdhci: Add support for hosts reporting inverted write-protect state This patch adds SDHCI_QUIRK_INVERTED_WRITE_PROTECT quirk. When specified, the sdhci driver will invert WP state. p.s. Actually, the quirk is more board-specific than controller-specific. Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 2 ++ drivers/mmc/host/sdhci.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index fc7cb489bdb7..c814220d214e 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1149,6 +1149,8 @@ static int sdhci_get_ro(struct mmc_host *mmc) spin_unlock_irqrestore(&host->lock, flags); + if (host->quirks & SDHCI_QUIRK_INVERTED_WRITE_PROTECT) + return !!(present & SDHCI_WRITE_PROTECT); return !(present & SDHCI_WRITE_PROTECT); } diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 968d713950f1..6980f2725b85 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -216,6 +216,8 @@ struct sdhci_host { #define SDHCI_QUIRK_NO_BUSY_IRQ (1<<14) /* Controller has unreliable card detection */ #define SDHCI_QUIRK_BROKEN_CARD_DETECTION (1<<15) +/* Controller reports inverted write-protect state */ +#define SDHCI_QUIRK_INVERTED_WRITE_PROTECT (1<<16) int irq; /* Device IRQ */ void __iomem * ioaddr; /* Mapped address */ -- cgit v1.2.3-59-g8ed1b From 4240ff0a02cb52f7d10dc1df6d82ba9c27dba07b Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 17 Mar 2009 00:13:57 +0300 Subject: sdhci: Add get_{max,timeout}_clock callbacks Some controllers do not provide clock information in their capabilities (in the Samsung case, it is because there are multiple clock sources available to the controller). Add hooks to allow the system to supply clock information. p.s. In the original Ben's patch there was a bug that makes sdhci_add_host() return -ENODEV even if callbacks were specified. This is fixed now. Signed-off-by: Ben Dooks Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 22 +++++++++++++++------- drivers/mmc/host/sdhci.h | 2 ++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index c814220d214e..34ab77bd12ae 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1674,19 +1674,27 @@ int sdhci_add_host(struct sdhci_host *host) host->max_clk = (caps & SDHCI_CLOCK_BASE_MASK) >> SDHCI_CLOCK_BASE_SHIFT; + host->max_clk *= 1000000; if (host->max_clk == 0) { - printk(KERN_ERR "%s: Hardware doesn't specify base clock " - "frequency.\n", mmc_hostname(mmc)); - return -ENODEV; + if (!host->ops->get_max_clock) { + printk(KERN_ERR + "%s: Hardware doesn't specify base clock " + "frequency.\n", mmc_hostname(mmc)); + return -ENODEV; + } + host->max_clk = host->ops->get_max_clock(host); } - host->max_clk *= 1000000; host->timeout_clk = (caps & SDHCI_TIMEOUT_CLK_MASK) >> SDHCI_TIMEOUT_CLK_SHIFT; if (host->timeout_clk == 0) { - printk(KERN_ERR "%s: Hardware doesn't specify timeout clock " - "frequency.\n", mmc_hostname(mmc)); - return -ENODEV; + if (!host->ops->get_timeout_clock) { + printk(KERN_ERR + "%s: Hardware doesn't specify timeout clock " + "frequency.\n", mmc_hostname(mmc)); + return -ENODEV; + } + host->timeout_clk = host->ops->get_timeout_clock(host); } if (caps & SDHCI_TIMEOUT_CLK_UNIT) host->timeout_clk *= 1000; diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 6980f2725b85..aab0652a4585 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -285,6 +285,8 @@ struct sdhci_ops { #endif int (*enable_dma)(struct sdhci_host *host); + unsigned int (*get_max_clock)(struct sdhci_host *host); + unsigned int (*get_timeout_clock)(struct sdhci_host *host); }; #ifdef CONFIG_MMC_SDHCI_IO_ACCESSORS -- cgit v1.2.3-59-g8ed1b From 8114634ccb54d67a8c01e5825d95bff4e7f7b357 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Mar 2009 00:13:59 +0300 Subject: sdhci: Add set_clock callback and a quirk for nonstandard clocks FSL eSDHC hosts have incompatible register map to manage the SDCLK. This patch adds set_clock callback so that drivers could overwrite set_clock behaviour. Similar patch[1] was posted by Ben Dooks, though in Ben's version the callback is named change_clock, plus the patch has some unrelated bits that makes the patch difficult to reuse. [1] http://lkml.org/lkml/2008/12/2/160 Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 6 ++++++ drivers/mmc/host/sdhci.h | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 34ab77bd12ae..75d0ecbce10c 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -949,6 +949,12 @@ static void sdhci_set_clock(struct sdhci_host *host, unsigned int clock) if (clock == host->clock) return; + if (host->ops->set_clock) { + host->ops->set_clock(host, clock); + if (host->quirks & SDHCI_QUIRK_NONSTANDARD_CLOCK) + return; + } + sdhci_writew(host, 0, SDHCI_CLOCK_CONTROL); if (clock == 0) diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index aab0652a4585..b9bc622735ba 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -218,6 +218,8 @@ struct sdhci_host { #define SDHCI_QUIRK_BROKEN_CARD_DETECTION (1<<15) /* Controller reports inverted write-protect state */ #define SDHCI_QUIRK_INVERTED_WRITE_PROTECT (1<<16) +/* Controller has nonstandard clock management */ +#define SDHCI_QUIRK_NONSTANDARD_CLOCK (1<<17) int irq; /* Device IRQ */ void __iomem * ioaddr; /* Mapped address */ @@ -284,6 +286,8 @@ struct sdhci_ops { void (*writeb)(struct sdhci_host *host, u8 val, int reg); #endif + void (*set_clock)(struct sdhci_host *host, unsigned int clock); + int (*enable_dma)(struct sdhci_host *host); unsigned int (*get_max_clock)(struct sdhci_host *host); unsigned int (*get_timeout_clock)(struct sdhci_host *host); -- cgit v1.2.3-59-g8ed1b From 3e3bf20756aeee57a40fd37b923263c9a51b8c68 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Mar 2009 00:14:00 +0300 Subject: sdhci: Add quirk for controllers that need small delays for PIO Small udelay is needed to make eSDHC work in PIO mode. Without the delay reading causes endless interrupt storm, and writing corrupts data. The first guess would be that we must wait for some bit in some register, but I didn't find any reliable bits that change before and after the delay. Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 3 +++ drivers/mmc/host/sdhci.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 75d0ecbce10c..cd6dab34ba54 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -336,6 +336,9 @@ static void sdhci_transfer_pio(struct sdhci_host *host) mask = ~0; while (sdhci_readl(host, SDHCI_PRESENT_STATE) & mask) { + if (host->quirks & SDHCI_QUIRK_PIO_NEEDS_DELAY) + udelay(100); + if (host->data->flags & MMC_DATA_READ) sdhci_read_block_pio(host); else diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index b9bc622735ba..c5ce9ee1a1bc 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -220,6 +220,8 @@ struct sdhci_host { #define SDHCI_QUIRK_INVERTED_WRITE_PROTECT (1<<16) /* Controller has nonstandard clock management */ #define SDHCI_QUIRK_NONSTANDARD_CLOCK (1<<17) +/* Controller does not like fast PIO transfers */ +#define SDHCI_QUIRK_PIO_NEEDS_DELAY (1<<18) int irq; /* Device IRQ */ void __iomem * ioaddr; /* Mapped address */ -- cgit v1.2.3-59-g8ed1b From 063a9dbbce5559770b7e2e2f51bd29adf3ab9b1e Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Mar 2009 00:14:02 +0300 Subject: sdhci: Add quirk for controllers that need IRQ re-init after reset FSL eSDHC controllers losing signal/interrupt enable states after reset, so we should re-enable them. Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 7 +++++++ drivers/mmc/host/sdhci.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index cd6dab34ba54..3a72fe2cf1cf 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -134,6 +134,7 @@ static void sdhci_disable_card_detection(struct sdhci_host *host) static void sdhci_reset(struct sdhci_host *host, u8 mask) { unsigned long timeout; + u32 uninitialized_var(ier); if (host->quirks & SDHCI_QUIRK_NO_CARD_NO_RESET) { if (!(sdhci_readl(host, SDHCI_PRESENT_STATE) & @@ -141,6 +142,9 @@ static void sdhci_reset(struct sdhci_host *host, u8 mask) return; } + if (host->quirks & SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET) + ier = sdhci_readl(host, SDHCI_INT_ENABLE); + sdhci_writeb(host, mask, SDHCI_SOFTWARE_RESET); if (mask & SDHCI_RESET_ALL) @@ -160,6 +164,9 @@ static void sdhci_reset(struct sdhci_host *host, u8 mask) timeout--; mdelay(1); } + + if (host->quirks & SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET) + sdhci_clear_set_irqs(host, SDHCI_INT_ALL_MASK, ier); } static void sdhci_init(struct sdhci_host *host) diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index c5ce9ee1a1bc..2962102b6953 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -222,6 +222,8 @@ struct sdhci_host { #define SDHCI_QUIRK_NONSTANDARD_CLOCK (1<<17) /* Controller does not like fast PIO transfers */ #define SDHCI_QUIRK_PIO_NEEDS_DELAY (1<<18) +/* Controller losing signal/interrupt enable states after reset */ +#define SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET (1<<19) int irq; /* Device IRQ */ void __iomem * ioaddr; /* Mapped address */ -- cgit v1.2.3-59-g8ed1b From 0633f654241483edc8a235ab87264ff6bbcd08d5 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Mar 2009 00:14:03 +0300 Subject: sdhci: Add quirk for forcing maximum block size to 2048 bytes FSL eSDHC controllers can support maximum block size up to 4096 bytes, the MBL (Maximum Block Length) field in the capabilities register extended by one bit, and is set to 0x3. But the SDHCI core doesn't support blocks of 4096 bytes, and thus forces blksz to the lowest value -- 512 bytes. With this patch we can pin up the blksz to the maximum supported block size, i.e. 2048 bytes. Signed-off-by: Anton Vorontsov Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 20 +++++++++++++------- drivers/mmc/host/sdhci.h | 2 ++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 3a72fe2cf1cf..30d8e3d4e6fd 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1777,13 +1777,19 @@ int sdhci_add_host(struct sdhci_host *host) * Maximum block size. This varies from controller to controller and * is specified in the capabilities register. */ - mmc->max_blk_size = (caps & SDHCI_MAX_BLOCK_MASK) >> SDHCI_MAX_BLOCK_SHIFT; - if (mmc->max_blk_size >= 3) { - printk(KERN_WARNING "%s: Invalid maximum block size, " - "assuming 512 bytes\n", mmc_hostname(mmc)); - mmc->max_blk_size = 512; - } else - mmc->max_blk_size = 512 << mmc->max_blk_size; + if (host->quirks & SDHCI_QUIRK_FORCE_BLK_SZ_2048) { + mmc->max_blk_size = 2; + } else { + mmc->max_blk_size = (caps & SDHCI_MAX_BLOCK_MASK) >> + SDHCI_MAX_BLOCK_SHIFT; + if (mmc->max_blk_size >= 3) { + printk(KERN_WARNING "%s: Invalid maximum block size, " + "assuming 512 bytes\n", mmc_hostname(mmc)); + mmc->max_blk_size = 0; + } + } + + mmc->max_blk_size = 512 << mmc->max_blk_size; /* * Maximum block count. diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 2962102b6953..f20a834f4309 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -224,6 +224,8 @@ struct sdhci_host { #define SDHCI_QUIRK_PIO_NEEDS_DELAY (1<<18) /* Controller losing signal/interrupt enable states after reset */ #define SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET (1<<19) +/* Controller has to be forced to use block size of 2048 bytes */ +#define SDHCI_QUIRK_FORCE_BLK_SZ_2048 (1<<20) int irq; /* Device IRQ */ void __iomem * ioaddr; /* Mapped address */ -- cgit v1.2.3-59-g8ed1b From 3085e9c1b24ab2322230d35efac72147b8213865 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 17 Mar 2009 00:14:05 +0300 Subject: mmc: Add OpenFirmware bindings for SDHCI driver This patch adds a new driver: sdhci-of. The driver is similar to the sdhci-pci, it contains common probe code, and controller-specific ops and quirks. So far there are only Freescale eSDHC ops and quirks. Signed-off-by: Anton Vorontsov Acked-by: Arnd Bergmann Signed-off-by: Pierre Ossman --- MAINTAINERS | 7 + drivers/mmc/host/Kconfig | 11 ++ drivers/mmc/host/Makefile | 1 + drivers/mmc/host/sdhci-of.c | 309 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 328 insertions(+) create mode 100644 drivers/mmc/host/sdhci-of.c diff --git a/MAINTAINERS b/MAINTAINERS index d17916ce2155..01cf1eaaec5f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3843,6 +3843,13 @@ M: drzeus-sdhci@drzeus.cx L: sdhci-devel@lists.ossman.eu S: Maintained +SECURE DIGITAL HOST CONTROLLER INTERFACE, OPEN FIRMWARE BINDINGS (SDHCI-OF) +P: Anton Vorontsov +M: avorontsov@ru.mvista.com +L: linuxppc-dev@ozlabs.org +L: sdhci-devel@lists.ossman.eu +S: Maintained + SECURITY SUBSYSTEM F: security/ P: James Morris diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index e8a7b50a01e7..126a0da095e4 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -72,6 +72,17 @@ config MMC_RICOH_MMC If unsure, say Y. +config MMC_SDHCI_OF + tristate "SDHCI support on OpenFirmware platforms" + depends on MMC_SDHCI && PPC_OF + select MMC_SDHCI_IO_ACCESSORS + help + This selects the OF support for Secure Digital Host Controller + Interfaces. So far, only the Freescale eSDHC controller is known + to exist on OF platforms. + + If unsure, say N. + config MMC_OMAP tristate "TI OMAP Multimedia Card Interface support" depends on ARCH_OMAP diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index c006eb47e592..970a997620e1 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_MMC_MXC) += mxcmmc.o obj-$(CONFIG_MMC_SDHCI) += sdhci.o obj-$(CONFIG_MMC_SDHCI_PCI) += sdhci-pci.o obj-$(CONFIG_MMC_RICOH_MMC) += ricoh_mmc.o +obj-$(CONFIG_MMC_SDHCI_OF) += sdhci-of.o obj-$(CONFIG_MMC_WBSD) += wbsd.o obj-$(CONFIG_MMC_AU1X) += au1xmmc.o obj-$(CONFIG_MMC_OMAP) += omap.o diff --git a/drivers/mmc/host/sdhci-of.c b/drivers/mmc/host/sdhci-of.c new file mode 100644 index 000000000000..3ff4ac3abe8b --- /dev/null +++ b/drivers/mmc/host/sdhci-of.c @@ -0,0 +1,309 @@ +/* + * OpenFirmware bindings for Secure Digital Host Controller Interface. + * + * Copyright (c) 2007 Freescale Semiconductor, Inc. + * Copyright (c) 2009 MontaVista Software, Inc. + * + * Authors: Xiaobo Xie + * Anton Vorontsov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sdhci.h" + +struct sdhci_of_data { + unsigned int quirks; + struct sdhci_ops ops; +}; + +struct sdhci_of_host { + unsigned int clock; + u16 xfer_mode_shadow; +}; + +/* + * Ops and quirks for the Freescale eSDHC controller. + */ + +#define ESDHC_DMA_SYSCTL 0x40c +#define ESDHC_DMA_SNOOP 0x00000040 + +#define ESDHC_SYSTEM_CONTROL 0x2c +#define ESDHC_CLOCK_MASK 0x0000fff0 +#define ESDHC_PREDIV_SHIFT 8 +#define ESDHC_DIVIDER_SHIFT 4 +#define ESDHC_CLOCK_PEREN 0x00000004 +#define ESDHC_CLOCK_HCKEN 0x00000002 +#define ESDHC_CLOCK_IPGEN 0x00000001 + +static u32 esdhc_readl(struct sdhci_host *host, int reg) +{ + return in_be32(host->ioaddr + reg); +} + +static u16 esdhc_readw(struct sdhci_host *host, int reg) +{ + return in_be16(host->ioaddr + (reg ^ 0x2)); +} + +static u8 esdhc_readb(struct sdhci_host *host, int reg) +{ + return in_8(host->ioaddr + (reg ^ 0x3)); +} + +static void esdhc_writel(struct sdhci_host *host, u32 val, int reg) +{ + out_be32(host->ioaddr + reg, val); +} + +static void esdhc_writew(struct sdhci_host *host, u16 val, int reg) +{ + struct sdhci_of_host *of_host = sdhci_priv(host); + int base = reg & ~0x3; + int shift = (reg & 0x2) * 8; + + switch (reg) { + case SDHCI_TRANSFER_MODE: + /* + * Postpone this write, we must do it together with a + * command write that is down below. + */ + of_host->xfer_mode_shadow = val; + return; + case SDHCI_COMMAND: + esdhc_writel(host, val << 16 | of_host->xfer_mode_shadow, + SDHCI_TRANSFER_MODE); + return; + case SDHCI_BLOCK_SIZE: + /* + * Two last DMA bits are reserved, and first one is used for + * non-standard blksz of 4096 bytes that we don't support + * yet. So clear the DMA boundary bits. + */ + val &= ~SDHCI_MAKE_BLKSZ(0x7, 0); + /* fall through */ + } + clrsetbits_be32(host->ioaddr + base, 0xffff << shift, val << shift); +} + +static void esdhc_writeb(struct sdhci_host *host, u8 val, int reg) +{ + int base = reg & ~0x3; + int shift = (reg & 0x3) * 8; + + clrsetbits_be32(host->ioaddr + base , 0xff << shift, val << shift); +} + +static void esdhc_set_clock(struct sdhci_host *host, unsigned int clock) +{ + int div; + int pre_div = 2; + + clrbits32(host->ioaddr + ESDHC_SYSTEM_CONTROL, ESDHC_CLOCK_IPGEN | + ESDHC_CLOCK_HCKEN | ESDHC_CLOCK_PEREN | ESDHC_CLOCK_MASK); + + if (clock == 0) + goto out; + + if (host->max_clk / 16 > clock) { + for (; pre_div < 256; pre_div *= 2) { + if (host->max_clk / pre_div < clock * 16) + break; + } + } + + for (div = 1; div <= 16; div++) { + if (host->max_clk / (div * pre_div) <= clock) + break; + } + + pre_div >>= 1; + + setbits32(host->ioaddr + ESDHC_SYSTEM_CONTROL, ESDHC_CLOCK_IPGEN | + ESDHC_CLOCK_HCKEN | ESDHC_CLOCK_PEREN | + div << ESDHC_DIVIDER_SHIFT | pre_div << ESDHC_PREDIV_SHIFT); + mdelay(100); +out: + host->clock = clock; +} + +static int esdhc_enable_dma(struct sdhci_host *host) +{ + setbits32(host->ioaddr + ESDHC_DMA_SYSCTL, ESDHC_DMA_SNOOP); + return 0; +} + +static unsigned int esdhc_get_max_clock(struct sdhci_host *host) +{ + struct sdhci_of_host *of_host = sdhci_priv(host); + + return of_host->clock; +} + +static unsigned int esdhc_get_timeout_clock(struct sdhci_host *host) +{ + struct sdhci_of_host *of_host = sdhci_priv(host); + + return of_host->clock / 1000; +} + +static struct sdhci_of_data sdhci_esdhc = { + .quirks = SDHCI_QUIRK_FORCE_BLK_SZ_2048 | + SDHCI_QUIRK_BROKEN_CARD_DETECTION | + SDHCI_QUIRK_INVERTED_WRITE_PROTECT | + SDHCI_QUIRK_NO_BUSY_IRQ | + SDHCI_QUIRK_NONSTANDARD_CLOCK | + SDHCI_QUIRK_PIO_NEEDS_DELAY | + SDHCI_QUIRK_RESTORE_IRQS_AFTER_RESET | + SDHCI_QUIRK_NO_CARD_NO_RESET, + .ops = { + .readl = esdhc_readl, + .readw = esdhc_readw, + .readb = esdhc_readb, + .writel = esdhc_writel, + .writew = esdhc_writew, + .writeb = esdhc_writeb, + .set_clock = esdhc_set_clock, + .enable_dma = esdhc_enable_dma, + .get_max_clock = esdhc_get_max_clock, + .get_timeout_clock = esdhc_get_timeout_clock, + }, +}; + +#ifdef CONFIG_PM + +static int sdhci_of_suspend(struct of_device *ofdev, pm_message_t state) +{ + struct sdhci_host *host = dev_get_drvdata(&ofdev->dev); + + return mmc_suspend_host(host->mmc, state); +} + +static int sdhci_of_resume(struct of_device *ofdev) +{ + struct sdhci_host *host = dev_get_drvdata(&ofdev->dev); + + return mmc_resume_host(host->mmc); +} + +#else + +#define sdhci_of_suspend NULL +#define sdhci_of_resume NULL + +#endif + +static int __devinit sdhci_of_probe(struct of_device *ofdev, + const struct of_device_id *match) +{ + struct device_node *np = ofdev->node; + struct sdhci_of_data *sdhci_of_data = match->data; + struct sdhci_host *host; + struct sdhci_of_host *of_host; + const u32 *clk; + int size; + int ret; + + if (!of_device_is_available(np)) + return -ENODEV; + + host = sdhci_alloc_host(&ofdev->dev, sizeof(*of_host)); + if (!host) + return -ENOMEM; + + of_host = sdhci_priv(host); + dev_set_drvdata(&ofdev->dev, host); + + host->ioaddr = of_iomap(np, 0); + if (!host->ioaddr) { + ret = -ENOMEM; + goto err_addr_map; + } + + host->irq = irq_of_parse_and_map(np, 0); + if (!host->irq) { + ret = -EINVAL; + goto err_no_irq; + } + + host->hw_name = dev_name(&ofdev->dev); + if (sdhci_of_data) { + host->quirks = sdhci_of_data->quirks; + host->ops = &sdhci_of_data->ops; + } + + clk = of_get_property(np, "clock-frequency", &size); + if (clk && size == sizeof(*clk) && *clk) + of_host->clock = *clk; + + ret = sdhci_add_host(host); + if (ret) + goto err_add_host; + + return 0; + +err_add_host: + irq_dispose_mapping(host->irq); +err_no_irq: + iounmap(host->ioaddr); +err_addr_map: + sdhci_free_host(host); + return ret; +} + +static int __devexit sdhci_of_remove(struct of_device *ofdev) +{ + struct sdhci_host *host = dev_get_drvdata(&ofdev->dev); + + sdhci_remove_host(host, 0); + sdhci_free_host(host); + irq_dispose_mapping(host->irq); + iounmap(host->ioaddr); + return 0; +} + +static const struct of_device_id sdhci_of_match[] = { + { .compatible = "fsl,mpc8379-esdhc", .data = &sdhci_esdhc, }, + { .compatible = "fsl,mpc8536-esdhc", .data = &sdhci_esdhc, }, + { .compatible = "generic-sdhci", }, + {}, +}; +MODULE_DEVICE_TABLE(of, sdhci_of_match); + +static struct of_platform_driver sdhci_of_driver = { + .driver.name = "sdhci-of", + .match_table = sdhci_of_match, + .probe = sdhci_of_probe, + .remove = __devexit_p(sdhci_of_remove), + .suspend = sdhci_of_suspend, + .resume = sdhci_of_resume, +}; + +static int __init sdhci_of_init(void) +{ + return of_register_platform_driver(&sdhci_of_driver); +} +module_init(sdhci_of_init); + +static void __exit sdhci_of_exit(void) +{ + of_unregister_platform_driver(&sdhci_of_driver); +} +module_exit(sdhci_of_exit); + +MODULE_DESCRIPTION("Secure Digital Host Controller Interface OF driver"); +MODULE_AUTHOR("Xiaobo Xie , " + "Anton Vorontsov "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-59-g8ed1b From d719f9006e0591196fe9c7181e1d1cb4d5ee1edf Mon Sep 17 00:00:00 2001 From: Pierre Ossman Date: Tue, 24 Mar 2009 21:06:09 +0100 Subject: tmio_mmc: add maintainer This is Ian's baby, so make a note of it in MAINTAINERS. Signed-off-by: Pierre Ossman --- MAINTAINERS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 01cf1eaaec5f..ef5522153f4b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4311,6 +4311,11 @@ L: tlinux-users@tce.toshiba-dme.co.jp W: http://www.buzzard.org.uk/toshiba/ S: Maintained +TMIO MMC DRIVER +P: Ian Molton +M: ian@mnementh.co.uk +S: Maintained + TPM DEVICE DRIVER P: Debora Velarde M: debora@linux.vnet.ibm.com -- cgit v1.2.3-59-g8ed1b From 32ab83a56fdf42f543b86c349143c2a86ead9707 Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Tue, 24 Mar 2009 11:06:06 +0100 Subject: atmel-mci: fix sdc_reg typo This fixes a bug when setting the sdc_reg for 4-bit bus width transactions. Signed-off-by: Hans-Christian Egtvedt Signed-off-by: Haavard Skinnemoen Signed-off-by: Pierre Ossman --- drivers/mmc/host/atmel-mci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index e94e92001e7c..cf6a100bb38f 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -812,7 +812,7 @@ static void atmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) slot->sdc_reg |= MCI_SDCBUS_1BIT; break; case MMC_BUS_WIDTH_4: - slot->sdc_reg = MCI_SDCBUS_4BIT; + slot->sdc_reg |= MCI_SDCBUS_4BIT; break; } -- cgit v1.2.3-59-g8ed1b From 0a5d649018b151cb9331c213a843ac4a3e7e44ab Mon Sep 17 00:00:00 2001 From: Jody McIntyre Date: Tue, 24 Mar 2009 16:00:28 -0400 Subject: tracing: Documentation / sample code fixes for tracepoints Fix the tracepoint documentation to refer to "tracepoint-sample" instead of "tracepoint-example" to match what actually exists; fix the directory, and clarify how to compile. Change every instance of "example" in the sample tracepoint code to "sample" for consistency. Signed-off-by: Jody McIntyre Acked-by: Mathieu Desnoyers Cc: torvalds@linux-foundation.org LKML-Reference: <20090324200027.GH8294@clouds> Signed-off-by: Ingo Molnar --- Documentation/tracepoints.txt | 13 +++++++------ samples/tracepoints/tracepoint-sample.c | 24 ++++++++++++------------ 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt index 4ff43c6de299..c0e1ceed75a4 100644 --- a/Documentation/tracepoints.txt +++ b/Documentation/tracepoints.txt @@ -103,13 +103,14 @@ used to export the defined tracepoints. * Probe / tracepoint example -See the example provided in samples/tracepoints/src +See the example provided in samples/tracepoints -Compile them with your kernel. +Compile them with your kernel. They are built during 'make' (not +'make modules') when CONFIG_SAMPLE_TRACEPOINTS=m. Run, as root : -modprobe tracepoint-example (insmod order is not important) -modprobe tracepoint-probe-example -cat /proc/tracepoint-example (returns an expected error) -rmmod tracepoint-example tracepoint-probe-example +modprobe tracepoint-sample (insmod order is not important) +modprobe tracepoint-probe-sample +cat /proc/tracepoint-sample (returns an expected error) +rmmod tracepoint-sample tracepoint-probe-sample dmesg diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c index 68d5dc0310e4..9cf80a11e8b6 100644 --- a/samples/tracepoints/tracepoint-sample.c +++ b/samples/tracepoints/tracepoint-sample.c @@ -1,6 +1,6 @@ /* tracepoint-sample.c * - * Executes a tracepoint when /proc/tracepoint-example is opened. + * Executes a tracepoint when /proc/tracepoint-sample is opened. * * (C) Copyright 2007 Mathieu Desnoyers * @@ -16,7 +16,7 @@ DEFINE_TRACE(subsys_event); DEFINE_TRACE(subsys_eventb); -struct proc_dir_entry *pentry_example; +struct proc_dir_entry *pentry_sample; static int my_open(struct inode *inode, struct file *file) { @@ -32,25 +32,25 @@ static struct file_operations mark_ops = { .open = my_open, }; -static int __init example_init(void) +static int __init sample_init(void) { - printk(KERN_ALERT "example init\n"); - pentry_example = proc_create("tracepoint-example", 0444, NULL, + printk(KERN_ALERT "sample init\n"); + pentry_sample = proc_create("tracepoint-sample", 0444, NULL, &mark_ops); - if (!pentry_example) + if (!pentry_sample) return -EPERM; return 0; } -static void __exit example_exit(void) +static void __exit sample_exit(void) { - printk(KERN_ALERT "example exit\n"); - remove_proc_entry("tracepoint-example", NULL); + printk(KERN_ALERT "sample exit\n"); + remove_proc_entry("tracepoint-sample", NULL); } -module_init(example_init) -module_exit(example_exit) +module_init(sample_init) +module_exit(sample_exit) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Mathieu Desnoyers"); -MODULE_DESCRIPTION("Tracepoint example"); +MODULE_DESCRIPTION("Tracepoint sample"); -- cgit v1.2.3-59-g8ed1b From 61dd7eb876344904aece6b1837da6ea1cbadbc07 Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Wed, 25 Mar 2009 02:49:03 +0300 Subject: [MTD] [MAPS] Drop now unused sharpsl-flash map Now as all PXA Zaurii are converted to use the physmap map, drop the sharpsl-flash map completely. Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: David Woodhouse --- drivers/mtd/maps/Kconfig | 6 -- drivers/mtd/maps/Makefile | 1 - drivers/mtd/maps/sharpsl-flash.c | 116 --------------------------------------- 3 files changed, 123 deletions(-) delete mode 100644 drivers/mtd/maps/sharpsl-flash.c diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index a0da8e243cc4..a2d9b95bcdd7 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -529,12 +529,6 @@ config MTD_DMV182 help Map driver for Dy-4 SVME/DMV-182 board. -config MTD_SHARP_SL - tristate "ROM mapped on Sharp SL Series" - depends on ARCH_PXA - help - This enables access to the flash chip on the Sharp SL Series of PDAs. - config MTD_INTEL_VR_NOR tristate "NOR flash on Intel Vermilion Range Expansion Bus CS0" depends on PCI diff --git a/drivers/mtd/maps/Makefile b/drivers/mtd/maps/Makefile index f53a7dc4a4fb..ba62013485a7 100644 --- a/drivers/mtd/maps/Makefile +++ b/drivers/mtd/maps/Makefile @@ -56,7 +56,6 @@ obj-$(CONFIG_MTD_IXP4XX) += ixp4xx.o obj-$(CONFIG_MTD_IXP2000) += ixp2000.o obj-$(CONFIG_MTD_WRSBC8260) += wr_sbc82xx_flash.o obj-$(CONFIG_MTD_DMV182) += dmv182.o -obj-$(CONFIG_MTD_SHARP_SL) += sharpsl-flash.o obj-$(CONFIG_MTD_PLATRAM) += plat-ram.o obj-$(CONFIG_MTD_OMAP_NOR) += omap_nor.o obj-$(CONFIG_MTD_INTEL_VR_NOR) += intel_vr_nor.o diff --git a/drivers/mtd/maps/sharpsl-flash.c b/drivers/mtd/maps/sharpsl-flash.c deleted file mode 100644 index b392f096c706..000000000000 --- a/drivers/mtd/maps/sharpsl-flash.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * sharpsl-flash.c - * - * Copyright (C) 2001 Lineo Japan, Inc. - * Copyright (C) 2002 SHARP - * - * based on rpxlite.c,v 1.15 2001/10/02 15:05:14 dwmw2 Exp - * Handle mapping of the flash on the RPX Lite and CLLF boards - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#define WINDOW_ADDR 0x00000000 -#define WINDOW_SIZE 0x00800000 -#define BANK_WIDTH 2 - -static struct mtd_info *mymtd; - -struct map_info sharpsl_map = { - .name = "sharpsl-flash", - .size = WINDOW_SIZE, - .bankwidth = BANK_WIDTH, - .phys = WINDOW_ADDR -}; - -static struct mtd_partition sharpsl_partitions[1] = { - { - name: "Boot PROM Filesystem", - } -}; - -static int __init init_sharpsl(void) -{ - struct mtd_partition *parts; - int nb_parts = 0; - char *part_type = "static"; - - printk(KERN_NOTICE "Sharp SL series flash device: %x at %x\n", - WINDOW_SIZE, WINDOW_ADDR); - sharpsl_map.virt = ioremap(WINDOW_ADDR, WINDOW_SIZE); - if (!sharpsl_map.virt) { - printk("Failed to ioremap\n"); - return -EIO; - } - - simple_map_init(&sharpsl_map); - - mymtd = do_map_probe("map_rom", &sharpsl_map); - if (!mymtd) { - iounmap(sharpsl_map.virt); - return -ENXIO; - } - - mymtd->owner = THIS_MODULE; - - if (machine_is_corgi() || machine_is_shepherd() || machine_is_husky() - || machine_is_poodle()) { - sharpsl_partitions[0].size=0x006d0000; - sharpsl_partitions[0].offset=0x00120000; - } else if (machine_is_tosa()) { - sharpsl_partitions[0].size=0x006a0000; - sharpsl_partitions[0].offset=0x00160000; - } else if (machine_is_spitz() || machine_is_akita() || machine_is_borzoi()) { - sharpsl_partitions[0].size=0x006b0000; - sharpsl_partitions[0].offset=0x00140000; - } else { - map_destroy(mymtd); - iounmap(sharpsl_map.virt); - return -ENODEV; - } - - parts = sharpsl_partitions; - nb_parts = ARRAY_SIZE(sharpsl_partitions); - - printk(KERN_NOTICE "Using %s partition definition\n", part_type); - add_mtd_partitions(mymtd, parts, nb_parts); - - return 0; -} - -static void __exit cleanup_sharpsl(void) -{ - if (mymtd) { - del_mtd_partitions(mymtd); - map_destroy(mymtd); - } - if (sharpsl_map.virt) { - iounmap(sharpsl_map.virt); - sharpsl_map.virt = 0; - } -} - -module_init(init_sharpsl); -module_exit(cleanup_sharpsl); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("SHARP (Original: Arnold Christensen )"); -MODULE_DESCRIPTION("MTD map driver for SHARP SL series"); -- cgit v1.2.3-59-g8ed1b From 3199aa6bc8766e17b8f60820c4f78d59c25fce0e Mon Sep 17 00:00:00 2001 From: "Han, Weidong" Date: Thu, 26 Feb 2009 17:31:12 +0800 Subject: intel-iommu: fix PCI device detach from virtual machine When assign a device behind conventional PCI bridge or PCIe to PCI/PCI-x bridge to a domain, it must assign its bridge and may also need to assign secondary interface to the same domain. Dependent assignment is already there, but dependent deassignment is missed when detach device from virtual machine. This results in conventional PCI device assignment failure after it has been assigned once. This patch addes dependent deassignment, and fixes the issue. Signed-off-by: Weidong Han Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index a0ba568b831c..e541c3bdbf0d 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2808,6 +2808,33 @@ static int vm_domain_add_dev_info(struct dmar_domain *domain, return 0; } +static void iommu_detach_dependent_devices(struct intel_iommu *iommu, + struct pci_dev *pdev) +{ + struct pci_dev *tmp, *parent; + + if (!iommu || !pdev) + return; + + /* dependent device detach */ + tmp = pci_find_upstream_pcie_bridge(pdev); + /* Secondary interface's bus number and devfn 0 */ + if (tmp) { + parent = pdev->bus->self; + while (parent != tmp) { + iommu_detach_dev(iommu, parent->bus->number, + parent->devfn); + parent = parent->bus->self; + } + if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */ + iommu_detach_dev(iommu, + tmp->subordinate->number, 0); + else /* this is a legacy PCI bridge */ + iommu_detach_dev(iommu, + tmp->bus->number, tmp->devfn); + } +} + static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, struct pci_dev *pdev) { @@ -2833,6 +2860,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, spin_unlock_irqrestore(&device_domain_lock, flags); iommu_detach_dev(iommu, info->bus, info->devfn); + iommu_detach_dependent_devices(iommu, pdev); free_devinfo_mem(info); spin_lock_irqsave(&device_domain_lock, flags); @@ -2882,6 +2910,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) iommu = device_to_iommu(info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); + iommu_detach_dependent_devices(iommu, info->dev); /* clear this iommu in iommu_bmp, update iommu count * and capabilities -- cgit v1.2.3-59-g8ed1b From 4cf2e75d0bec15d945972b005056c4a8731b82cf Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Feb 2009 17:23:43 +0000 Subject: intel-iommu: Enable DMAR on 32-bit kernel. If we fix a few highmem-related thinkos and a couple of printk format warnings, the Intel IOMMU driver works fine in a 32-bit kernel. Signed-off-by: David Woodhouse --- arch/x86/Kconfig | 2 +- drivers/pci/intel-iommu.c | 20 ++++++++------------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bc2fbadff9f9..5ff2252ec475 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1794,7 +1794,7 @@ config PCI_MMCONFIG config DMAR bool "Support for DMA Remapping Devices (EXPERIMENTAL)" - depends on X86_64 && PCI_MSI && ACPI && EXPERIMENTAL + depends on PCI_MSI && ACPI && EXPERIMENTAL help DMA remapping (DMAR) devices support enables independent address translations for Direct Memory Access (DMA) from devices. diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index e541c3bdbf0d..0c12d06bade6 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2315,7 +2315,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, error: if (iova) __free_iova(&domain->iovad, iova); - printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n", + printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n", pci_name(pdev), size, (unsigned long long)paddr, dir); return 0; } @@ -2411,7 +2411,7 @@ void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size, start_addr = iova->pfn_lo << PAGE_SHIFT; size = aligned_size((u64)dev_addr, size); - pr_debug("Device %s unmapping: %lx@%llx\n", + pr_debug("Device %s unmapping: %zx@%llx\n", pci_name(pdev), size, (unsigned long long)start_addr); /* clear the whole page */ @@ -2469,8 +2469,6 @@ void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, order); } -#define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg))) - void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, int dir) { @@ -2480,7 +2478,7 @@ void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, unsigned long start_addr; struct iova *iova; size_t size = 0; - void *addr; + phys_addr_t addr; struct scatterlist *sg; struct intel_iommu *iommu; @@ -2496,7 +2494,7 @@ void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, if (!iova) return; for_each_sg(sglist, sg, nelems, i) { - addr = SG_ENT_VIRT_ADDRESS(sg); + addr = page_to_phys(sg_page(sg)) + sg->offset; size += aligned_size((u64)addr, sg->length); } @@ -2523,7 +2521,7 @@ static int intel_nontranslate_map_sg(struct device *hddev, for_each_sg(sglist, sg, nelems, i) { BUG_ON(!sg_page(sg)); - sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg)); + sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset; sg->dma_length = sg->length; } return nelems; @@ -2532,7 +2530,7 @@ static int intel_nontranslate_map_sg(struct device *hddev, int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, int dir) { - void *addr; + phys_addr_t addr; int i; struct pci_dev *pdev = to_pci_dev(hwdev); struct dmar_domain *domain; @@ -2556,8 +2554,7 @@ int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, iommu = domain_get_iommu(domain); for_each_sg(sglist, sg, nelems, i) { - addr = SG_ENT_VIRT_ADDRESS(sg); - addr = (void *)virt_to_phys(addr); + addr = page_to_phys(sg_page(sg)) + sg->offset; size += aligned_size((u64)addr, sg->length); } @@ -2580,8 +2577,7 @@ int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, start_addr = iova->pfn_lo << PAGE_SHIFT; offset = 0; for_each_sg(sglist, sg, nelems, i) { - addr = SG_ENT_VIRT_ADDRESS(sg); - addr = (void *)virt_to_phys(addr); + addr = page_to_phys(sg_page(sg)) + sg->offset; size = aligned_size((u64)addr, sg->length); ret = domain_page_mapping(domain, start_addr + offset, ((u64)addr) & PAGE_MASK, -- cgit v1.2.3-59-g8ed1b From afeeb7cebbd223ffee303fd8de4ba97458b13581 Mon Sep 17 00:00:00 2001 From: "Zhao, Yu" Date: Fri, 13 Feb 2009 17:55:49 +0800 Subject: intel-iommu: Fix address wrap on 32-bit kernel. The problem is in dma_pte_clear_range and dma_pte_free_pagetable. When intel_unmap_single and intel_unmap_sg call them, the end address may be zero if the 'start_addr + size' rounds up. So no PTE gets cleared. The uncleared PTE fires the BUG_ON when it's used again to create new mappings. After I modified dma_pte_clear_range a bit, the BUG_ON is gone. Tested both 32 and 32 PAE modes on Intel X58 and Q35 platforms. Signed-off-by: Yu Zhao Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 0c12d06bade6..002c8b95edf8 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -718,15 +718,17 @@ static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr) static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end) { int addr_width = agaw_to_width(domain->agaw); + int npages; start &= (((u64)1) << addr_width) - 1; end &= (((u64)1) << addr_width) - 1; /* in case it's partial page */ start = PAGE_ALIGN(start); end &= PAGE_MASK; + npages = (end - start) / VTD_PAGE_SIZE; /* we don't need lock here, nobody else touches the iova range */ - while (start < end) { + while (npages--) { dma_pte_clear_one(domain, start); start += VTD_PAGE_SIZE; } -- cgit v1.2.3-59-g8ed1b From 257b17ca030387cb17314cd1851507bdd1b4ddd5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:23 -0700 Subject: dmaengine: fail device registration if channel registration fails Atsushi points out: "If alloc_percpu or kzalloc failed, chan_id does not match with its position in device->channels list. And above "continue" looks buggy anyway. Keeping incomplete channels in device->channels list looks very dangerous..." Also, fix up leakage of idr_ref in the idr_pre_get() and channel init fail cases. Reported-by: Atsushi Nemoto Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 51 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 280a9d263eb3..49243d14b894 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -602,6 +602,24 @@ void dmaengine_put(void) } EXPORT_SYMBOL(dmaengine_put); +static int get_dma_id(struct dma_device *device) +{ + int rc; + + idr_retry: + if (!idr_pre_get(&dma_idr, GFP_KERNEL)) + return -ENOMEM; + mutex_lock(&dma_list_mutex); + rc = idr_get_new(&dma_idr, NULL, &device->dev_id); + mutex_unlock(&dma_list_mutex); + if (rc == -EAGAIN) + goto idr_retry; + else if (rc != 0) + return rc; + + return 0; +} + /** * dma_async_device_register - registers DMA devices found * @device: &dma_device @@ -640,27 +658,25 @@ int dma_async_device_register(struct dma_device *device) idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); if (!idr_ref) return -ENOMEM; - atomic_set(idr_ref, 0); - idr_retry: - if (!idr_pre_get(&dma_idr, GFP_KERNEL)) - return -ENOMEM; - mutex_lock(&dma_list_mutex); - rc = idr_get_new(&dma_idr, NULL, &device->dev_id); - mutex_unlock(&dma_list_mutex); - if (rc == -EAGAIN) - goto idr_retry; - else if (rc != 0) + rc = get_dma_id(device); + if (rc != 0) { + kfree(idr_ref); return rc; + } + + atomic_set(idr_ref, 0); /* represent channels in sysfs. Probably want devs too */ list_for_each_entry(chan, &device->channels, device_node) { + rc = -ENOMEM; chan->local = alloc_percpu(typeof(*chan->local)); if (chan->local == NULL) - continue; + goto err_out; chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL); if (chan->dev == NULL) { free_percpu(chan->local); - continue; + chan->local = NULL; + goto err_out; } chan->chan_id = chancnt++; @@ -677,6 +693,8 @@ int dma_async_device_register(struct dma_device *device) if (rc) { free_percpu(chan->local); chan->local = NULL; + kfree(chan->dev); + atomic_dec(idr_ref); goto err_out; } chan->client_count = 0; @@ -707,6 +725,15 @@ int dma_async_device_register(struct dma_device *device) return 0; err_out: + /* if we never registered a channel just release the idr */ + if (atomic_read(idr_ref) == 0) { + mutex_lock(&dma_list_mutex); + idr_remove(&dma_idr, device->dev_id); + mutex_unlock(&dma_list_mutex); + kfree(idr_ref); + return rc; + } + list_for_each_entry(chan, &device->channels, device_node) { if (chan->local == NULL) continue; -- cgit v1.2.3-59-g8ed1b From 0149f7d5dc66dcffbb044ba005a5378a5864d2a3 Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Wed, 25 Mar 2009 09:13:23 -0700 Subject: dma: ipu_idmac driver cosmetic clean-up Remove superfluous semicolons, update comments. Signed-off-by: Guennadi Liakhovetski Signed-off-by: Dan Williams --- drivers/dma/ipu/ipu_idmac.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c index ae50a9d1a4e6..a6f7294c8598 100644 --- a/drivers/dma/ipu/ipu_idmac.c +++ b/drivers/dma/ipu/ipu_idmac.c @@ -107,7 +107,7 @@ static uint32_t bytes_per_pixel(enum pixel_fmt fmt) } } -/* Enable / disable direct write to memory by the Camera Sensor Interface */ +/* Enable direct write to memory by the Camera Sensor Interface */ static void ipu_ic_enable_task(struct ipu *ipu, enum ipu_channel channel) { uint32_t ic_conf, mask; @@ -126,6 +126,7 @@ static void ipu_ic_enable_task(struct ipu *ipu, enum ipu_channel channel) idmac_write_icreg(ipu, ic_conf, IC_CONF); } +/* Called under spin_lock_irqsave(&ipu_data.lock) */ static void ipu_ic_disable_task(struct ipu *ipu, enum ipu_channel channel) { uint32_t ic_conf, mask; @@ -422,7 +423,7 @@ static void ipu_ch_param_set_size(union chan_param_mem *params, break; default: dev_err(ipu_data.dev, - "mxc ipu: unimplemented pixel format %d\n", pixel_fmt); + "mx3 ipu: unimplemented pixel format %d\n", pixel_fmt); break; } @@ -433,20 +434,20 @@ static void ipu_ch_param_set_burst_size(union chan_param_mem *params, uint16_t burst_pixels) { params->pp.npb = burst_pixels - 1; -}; +} static void ipu_ch_param_set_buffer(union chan_param_mem *params, dma_addr_t buf0, dma_addr_t buf1) { params->pp.eba0 = buf0; params->pp.eba1 = buf1; -}; +} static void ipu_ch_param_set_rotation(union chan_param_mem *params, enum ipu_rotate_mode rotate) { params->pp.bam = rotate; -}; +} static void ipu_write_param_mem(uint32_t addr, uint32_t *data, uint32_t num_words) @@ -571,7 +572,7 @@ static uint32_t dma_param_addr(uint32_t dma_ch) { /* Channel Parameter Memory */ return 0x10000 | (dma_ch << 4); -}; +} static void ipu_channel_set_priority(struct ipu *ipu, enum ipu_channel channel, bool prio) @@ -611,7 +612,8 @@ static uint32_t ipu_channel_conf_mask(enum ipu_channel channel) /** * ipu_enable_channel() - enable an IPU channel. - * @channel: channel ID. + * @idmac: IPU DMAC context. + * @ichan: IDMAC channel. * @return: 0 on success or negative error code on failure. */ static int ipu_enable_channel(struct idmac *idmac, struct idmac_channel *ichan) @@ -649,7 +651,7 @@ static int ipu_enable_channel(struct idmac *idmac, struct idmac_channel *ichan) /** * ipu_init_channel_buffer() - initialize a buffer for logical IPU channel. - * @channel: channel ID. + * @ichan: IDMAC channel. * @pixel_fmt: pixel format of buffer. Pixel format is a FOURCC ASCII code. * @width: width of buffer in pixels. * @height: height of buffer in pixels. @@ -687,7 +689,7 @@ static int ipu_init_channel_buffer(struct idmac_channel *ichan, } /* IC channel's stride must be a multiple of 8 pixels */ - if ((channel <= 13) && (stride % 8)) { + if ((channel <= IDMAC_IC_13) && (stride % 8)) { dev_err(ipu->dev, "Stride must be 8 pixel multiple\n"); return -EINVAL; } @@ -752,7 +754,7 @@ static void ipu_select_buffer(enum ipu_channel channel, int buffer_n) /** * ipu_update_channel_buffer() - update physical address of a channel buffer. - * @channel: channel ID. + * @ichan: IDMAC channel. * @buffer_n: buffer number to update. * 0 or 1 are the only valid values. * @phyaddr: buffer physical address. @@ -1341,13 +1343,7 @@ static void ipu_gc_tasklet(unsigned long arg) } } -/* - * At the time .device_alloc_chan_resources() method is called, we cannot know, - * whether the client will accept the channel. Thus we must only check, if we - * can satisfy client's request but the only real criterion to verify, whether - * the client has accepted our offer is the client_count. That's why we have to - * perform the rest of our allocation tasks on the first call to this function. - */ +/* Allocate and initialise a transfer descriptor. */ static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, enum dma_data_direction direction, unsigned long tx_flags) @@ -1432,8 +1428,7 @@ static void __idmac_terminate_all(struct dma_chan *chan) struct idmac_tx_desc *desc = ichan->desc + i; if (list_empty(&desc->list)) /* Descriptor was prepared, but not submitted */ - list_add(&desc->list, - &ichan->free_list); + list_add(&desc->list, &ichan->free_list); async_tx_clear_ack(&desc->txd); } -- cgit v1.2.3-59-g8ed1b From 234f2df56f5b05756c444edc9879145deddf69f4 Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Wed, 25 Mar 2009 09:13:24 -0700 Subject: dma: improve section assignment in i.MX31 IPU DMA driver The i.MX31 IPU DMA driver is a platform driver, but doesn't need hotplug, so we can use __init and __exit function attributes. Signed-off-by: Guennadi Liakhovetski Signed-off-by: Dan Williams --- drivers/dma/ipu/ipu_idmac.c | 8 ++++---- drivers/dma/ipu/ipu_irq.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c index a6f7294c8598..b759ae9315ba 100644 --- a/drivers/dma/ipu/ipu_idmac.c +++ b/drivers/dma/ipu/ipu_idmac.c @@ -1576,7 +1576,7 @@ static int __init ipu_idmac_init(struct ipu *ipu) return dma_async_device_register(&idmac->dma); } -static void ipu_idmac_exit(struct ipu *ipu) +static void __exit ipu_idmac_exit(struct ipu *ipu) { int i; struct idmac *idmac = &ipu->idmac; @@ -1595,7 +1595,7 @@ static void ipu_idmac_exit(struct ipu *ipu) * IPU common probe / remove */ -static int ipu_probe(struct platform_device *pdev) +static int __init ipu_probe(struct platform_device *pdev) { struct ipu_platform_data *pdata = pdev->dev.platform_data; struct resource *mem_ipu, *mem_ic; @@ -1695,7 +1695,7 @@ err_noirq: return ret; } -static int ipu_remove(struct platform_device *pdev) +static int __exit ipu_remove(struct platform_device *pdev) { struct ipu *ipu = platform_get_drvdata(pdev); @@ -1720,7 +1720,7 @@ static struct platform_driver ipu_platform_driver = { .name = "ipu-core", .owner = THIS_MODULE, }, - .remove = ipu_remove, + .remove = __exit_p(ipu_remove), }; static int __init ipu_init(void) diff --git a/drivers/dma/ipu/ipu_irq.c b/drivers/dma/ipu/ipu_irq.c index 83f532cc767f..dd8ebc75b667 100644 --- a/drivers/dma/ipu/ipu_irq.c +++ b/drivers/dma/ipu/ipu_irq.c @@ -352,7 +352,7 @@ static struct irq_chip ipu_irq_chip = { }; /* Install the IRQ handler */ -int ipu_irq_attach_irq(struct ipu *ipu, struct platform_device *dev) +int __init ipu_irq_attach_irq(struct ipu *ipu, struct platform_device *dev) { struct ipu_platform_data *pdata = dev->dev.platform_data; unsigned int irq, irq_base, i; -- cgit v1.2.3-59-g8ed1b From 8d47bae004f062630f69f7f83d098424252e232d Mon Sep 17 00:00:00 2001 From: Guennadi Liakhovetski Date: Wed, 25 Mar 2009 09:13:24 -0700 Subject: dma: i.MX31 IPU DMA robustness improvements Add DMA error handling to the ISR, move common code fragments to functions, fix scatter-gather element queuing in the ISR, survive channel freeing and re-allocation in a quick succession. Signed-off-by: Guennadi Liakhovetski Signed-off-by: Dan Williams --- drivers/dma/ipu/ipu_idmac.c | 265 ++++++++++++++++++++++++++++++-------------- 1 file changed, 181 insertions(+), 84 deletions(-) diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c index b759ae9315ba..0a525c450f24 100644 --- a/drivers/dma/ipu/ipu_idmac.c +++ b/drivers/dma/ipu/ipu_idmac.c @@ -28,6 +28,9 @@ #define FS_VF_IN_VALID 0x00000002 #define FS_ENC_IN_VALID 0x00000001 +static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan, + bool wait_for_stop); + /* * There can be only one, we could allocate it dynamically, but then we'd have * to add an extra parameter to some functions, and use something as ugly as @@ -762,9 +765,10 @@ static void ipu_select_buffer(enum ipu_channel channel, int buffer_n) * function will fail if the buffer is set to ready. */ /* Called under spin_lock(_irqsave)(&ichan->lock) */ -static int ipu_update_channel_buffer(enum ipu_channel channel, +static int ipu_update_channel_buffer(struct idmac_channel *ichan, int buffer_n, dma_addr_t phyaddr) { + enum ipu_channel channel = ichan->dma_chan.chan_id; uint32_t reg; unsigned long flags; @@ -773,8 +777,8 @@ static int ipu_update_channel_buffer(enum ipu_channel channel, if (buffer_n == 0) { reg = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY); if (reg & (1UL << channel)) { - spin_unlock_irqrestore(&ipu_data.lock, flags); - return -EACCES; + ipu_ic_disable_task(&ipu_data, channel); + ichan->status = IPU_CHANNEL_READY; } /* 44.3.3.1.9 - Row Number 1 (WORD1, offset 0) */ @@ -784,8 +788,8 @@ static int ipu_update_channel_buffer(enum ipu_channel channel, } else { reg = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY); if (reg & (1UL << channel)) { - spin_unlock_irqrestore(&ipu_data.lock, flags); - return -EACCES; + ipu_ic_disable_task(&ipu_data, channel); + ichan->status = IPU_CHANNEL_READY; } /* Check if double-buffering is already enabled */ @@ -806,6 +810,39 @@ static int ipu_update_channel_buffer(enum ipu_channel channel, return 0; } +/* Called under spin_lock_irqsave(&ichan->lock) */ +static int ipu_submit_buffer(struct idmac_channel *ichan, + struct idmac_tx_desc *desc, struct scatterlist *sg, int buf_idx) +{ + unsigned int chan_id = ichan->dma_chan.chan_id; + struct device *dev = &ichan->dma_chan.dev->device; + int ret; + + if (async_tx_test_ack(&desc->txd)) + return -EINTR; + + /* + * On first invocation this shouldn't be necessary, the call to + * ipu_init_channel_buffer() above will set addresses for us, so we + * could make it conditional on status >= IPU_CHANNEL_ENABLED, but + * doing it again shouldn't hurt either. + */ + ret = ipu_update_channel_buffer(ichan, buf_idx, + sg_dma_address(sg)); + + if (ret < 0) { + dev_err(dev, "Updating sg %p on channel 0x%x buffer %d failed!\n", + sg, chan_id, buf_idx); + return ret; + } + + ipu_select_buffer(chan_id, buf_idx); + dev_dbg(dev, "Updated sg %p on channel 0x%x buffer %d\n", + sg, chan_id, buf_idx); + + return 0; +} + /* Called under spin_lock_irqsave(&ichan->lock) */ static int ipu_submit_channel_buffers(struct idmac_channel *ichan, struct idmac_tx_desc *desc) @@ -817,20 +854,10 @@ static int ipu_submit_channel_buffers(struct idmac_channel *ichan, if (!ichan->sg[i]) { ichan->sg[i] = sg; - /* - * On first invocation this shouldn't be necessary, the - * call to ipu_init_channel_buffer() above will set - * addresses for us, so we could make it conditional - * on status >= IPU_CHANNEL_ENABLED, but doing it again - * shouldn't hurt either. - */ - ret = ipu_update_channel_buffer(ichan->dma_chan.chan_id, i, - sg_dma_address(sg)); + ret = ipu_submit_buffer(ichan, desc, sg, i); if (ret < 0) return ret; - ipu_select_buffer(ichan->dma_chan.chan_id, i); - sg = sg_next(sg); } } @@ -844,19 +871,22 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx) struct idmac_channel *ichan = to_idmac_chan(tx->chan); struct idmac *idmac = to_idmac(tx->chan->device); struct ipu *ipu = to_ipu(idmac); + struct device *dev = &ichan->dma_chan.dev->device; dma_cookie_t cookie; unsigned long flags; + int ret; /* Sanity check */ if (!list_empty(&desc->list)) { /* The descriptor doesn't belong to client */ - dev_err(&ichan->dma_chan.dev->device, - "Descriptor %p not prepared!\n", tx); + dev_err(dev, "Descriptor %p not prepared!\n", tx); return -EBUSY; } mutex_lock(&ichan->chan_mutex); + async_tx_clear_ack(tx); + if (ichan->status < IPU_CHANNEL_READY) { struct idmac_video_param *video = &ichan->params.video; /* @@ -880,16 +910,7 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx) goto out; } - /* ipu->lock can be taken under ichan->lock, but not v.v. */ - spin_lock_irqsave(&ichan->lock, flags); - - /* submit_buffers() atomically verifies and fills empty sg slots */ - cookie = ipu_submit_channel_buffers(ichan, desc); - - spin_unlock_irqrestore(&ichan->lock, flags); - - if (cookie < 0) - goto out; + dev_dbg(dev, "Submitting sg %p\n", &desc->sg[0]); cookie = ichan->dma_chan.cookie; @@ -899,24 +920,40 @@ static dma_cookie_t idmac_tx_submit(struct dma_async_tx_descriptor *tx) /* from dmaengine.h: "last cookie value returned to client" */ ichan->dma_chan.cookie = cookie; tx->cookie = cookie; + + /* ipu->lock can be taken under ichan->lock, but not v.v. */ spin_lock_irqsave(&ichan->lock, flags); + list_add_tail(&desc->list, &ichan->queue); + /* submit_buffers() atomically verifies and fills empty sg slots */ + ret = ipu_submit_channel_buffers(ichan, desc); + spin_unlock_irqrestore(&ichan->lock, flags); + if (ret < 0) { + cookie = ret; + goto dequeue; + } + if (ichan->status < IPU_CHANNEL_ENABLED) { - int ret = ipu_enable_channel(idmac, ichan); + ret = ipu_enable_channel(idmac, ichan); if (ret < 0) { cookie = ret; - spin_lock_irqsave(&ichan->lock, flags); - list_del_init(&desc->list); - spin_unlock_irqrestore(&ichan->lock, flags); - tx->cookie = cookie; - ichan->dma_chan.cookie = cookie; + goto dequeue; } } dump_idmac_reg(ipu); +dequeue: + if (cookie < 0) { + spin_lock_irqsave(&ichan->lock, flags); + list_del_init(&desc->list); + spin_unlock_irqrestore(&ichan->lock, flags); + tx->cookie = cookie; + ichan->dma_chan.cookie = cookie; + } + out: mutex_unlock(&ichan->chan_mutex); @@ -1163,6 +1200,24 @@ static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan, return 0; } +static struct scatterlist *idmac_sg_next(struct idmac_channel *ichan, + struct idmac_tx_desc **desc, struct scatterlist *sg) +{ + struct scatterlist *sgnew = sg ? sg_next(sg) : NULL; + + if (sgnew) + /* next sg-element in this list */ + return sgnew; + + if ((*desc)->list.next == &ichan->queue) + /* No more descriptors on the queue */ + return NULL; + + /* Fetch next descriptor */ + *desc = list_entry((*desc)->list.next, struct idmac_tx_desc, list); + return (*desc)->sg; +} + /* * We have several possibilities here: * current BUF next BUF @@ -1178,23 +1233,46 @@ static int ipu_disable_channel(struct idmac *idmac, struct idmac_channel *ichan, static irqreturn_t idmac_interrupt(int irq, void *dev_id) { struct idmac_channel *ichan = dev_id; + struct device *dev = &ichan->dma_chan.dev->device; unsigned int chan_id = ichan->dma_chan.chan_id; struct scatterlist **sg, *sgnext, *sgnew = NULL; /* Next transfer descriptor */ - struct idmac_tx_desc *desc = NULL, *descnew; + struct idmac_tx_desc *desc, *descnew; dma_async_tx_callback callback; void *callback_param; bool done = false; - u32 ready0 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY), - ready1 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY), - curbuf = idmac_read_ipureg(&ipu_data, IPU_CHA_CUR_BUF); + u32 ready0, ready1, curbuf, err; + unsigned long flags; /* IDMAC has cleared the respective BUFx_RDY bit, we manage the buffer */ - pr_debug("IDMAC irq %d\n", irq); + dev_dbg(dev, "IDMAC irq %d, buf %d\n", irq, ichan->active_buffer); + + spin_lock_irqsave(&ipu_data.lock, flags); + + ready0 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY); + ready1 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY); + curbuf = idmac_read_ipureg(&ipu_data, IPU_CHA_CUR_BUF); + err = idmac_read_ipureg(&ipu_data, IPU_INT_STAT_4); + + if (err & (1 << chan_id)) { + idmac_write_ipureg(&ipu_data, 1 << chan_id, IPU_INT_STAT_4); + spin_unlock_irqrestore(&ipu_data.lock, flags); + /* + * Doing this + * ichan->sg[0] = ichan->sg[1] = NULL; + * you can force channel re-enable on the next tx_submit(), but + * this is dirty - think about descriptors with multiple + * sg elements. + */ + dev_warn(dev, "NFB4EOF on channel %d, ready %x, %x, cur %x\n", + chan_id, ready0, ready1, curbuf); + return IRQ_HANDLED; + } + spin_unlock_irqrestore(&ipu_data.lock, flags); + /* Other interrupts do not interfere with this channel */ spin_lock(&ichan->lock); - if (unlikely(chan_id != IDMAC_SDC_0 && chan_id != IDMAC_SDC_1 && ((curbuf >> chan_id) & 1) == ichan->active_buffer)) { int i = 100; @@ -1209,19 +1287,23 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) if (!i) { spin_unlock(&ichan->lock); - dev_dbg(ichan->dma_chan.device->dev, + dev_dbg(dev, "IRQ on active buffer on channel %x, active " "%d, ready %x, %x, current %x!\n", chan_id, ichan->active_buffer, ready0, ready1, curbuf); return IRQ_NONE; - } + } else + dev_dbg(dev, + "Buffer deactivated on channel %x, active " + "%d, ready %x, %x, current %x, rest %d!\n", chan_id, + ichan->active_buffer, ready0, ready1, curbuf, i); } if (unlikely((ichan->active_buffer && (ready1 >> chan_id) & 1) || (!ichan->active_buffer && (ready0 >> chan_id) & 1) )) { spin_unlock(&ichan->lock); - dev_dbg(ichan->dma_chan.device->dev, + dev_dbg(dev, "IRQ with active buffer still ready on channel %x, " "active %d, ready %x, %x!\n", chan_id, ichan->active_buffer, ready0, ready1); @@ -1229,8 +1311,9 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) } if (unlikely(list_empty(&ichan->queue))) { + ichan->sg[ichan->active_buffer] = NULL; spin_unlock(&ichan->lock); - dev_err(ichan->dma_chan.device->dev, + dev_err(dev, "IRQ without queued buffers on channel %x, active %d, " "ready %x, %x!\n", chan_id, ichan->active_buffer, ready0, ready1); @@ -1245,40 +1328,44 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) sg = &ichan->sg[ichan->active_buffer]; sgnext = ichan->sg[!ichan->active_buffer]; + if (!*sg) { + spin_unlock(&ichan->lock); + return IRQ_HANDLED; + } + + desc = list_entry(ichan->queue.next, struct idmac_tx_desc, list); + descnew = desc; + + dev_dbg(dev, "IDMAC irq %d, dma 0x%08x, next dma 0x%08x, current %d, curbuf 0x%08x\n", + irq, sg_dma_address(*sg), sgnext ? sg_dma_address(sgnext) : 0, ichan->active_buffer, curbuf); + + /* Find the descriptor of sgnext */ + sgnew = idmac_sg_next(ichan, &descnew, *sg); + if (sgnext != sgnew) + dev_err(dev, "Submitted buffer %p, next buffer %p\n", sgnext, sgnew); + /* * if sgnext == NULL sg must be the last element in a scatterlist and * queue must be empty */ if (unlikely(!sgnext)) { - if (unlikely(sg_next(*sg))) { - dev_err(ichan->dma_chan.device->dev, - "Broken buffer-update locking on channel %x!\n", - chan_id); - /* We'll let the user catch up */ + if (!WARN_ON(sg_next(*sg))) + dev_dbg(dev, "Underrun on channel %x\n", chan_id); + ichan->sg[!ichan->active_buffer] = sgnew; + + if (unlikely(sgnew)) { + ipu_submit_buffer(ichan, descnew, sgnew, !ichan->active_buffer); } else { - /* Underrun */ + spin_lock_irqsave(&ipu_data.lock, flags); ipu_ic_disable_task(&ipu_data, chan_id); - dev_dbg(ichan->dma_chan.device->dev, - "Underrun on channel %x\n", chan_id); + spin_unlock_irqrestore(&ipu_data.lock, flags); ichan->status = IPU_CHANNEL_READY; /* Continue to check for complete descriptor */ } } - desc = list_entry(ichan->queue.next, struct idmac_tx_desc, list); - - /* First calculate and submit the next sg element */ - if (likely(sgnext)) - sgnew = sg_next(sgnext); - - if (unlikely(!sgnew)) { - /* Start a new scatterlist, if any queued */ - if (likely(desc->list.next != &ichan->queue)) { - descnew = list_entry(desc->list.next, - struct idmac_tx_desc, list); - sgnew = &descnew->sg[0]; - } - } + /* Calculate and submit the next sg element */ + sgnew = idmac_sg_next(ichan, &descnew, sgnew); if (unlikely(!sg_next(*sg)) || !sgnext) { /* @@ -1291,17 +1378,13 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) *sg = sgnew; - if (likely(sgnew)) { - int ret; - - ret = ipu_update_channel_buffer(chan_id, ichan->active_buffer, - sg_dma_address(*sg)); - if (ret < 0) - dev_err(ichan->dma_chan.device->dev, - "Failed to update buffer on channel %x buffer %d!\n", - chan_id, ichan->active_buffer); - else - ipu_select_buffer(chan_id, ichan->active_buffer); + if (likely(sgnew) && + ipu_submit_buffer(ichan, descnew, sgnew, ichan->active_buffer) < 0) { + callback = desc->txd.callback; + callback_param = desc->txd.callback_param; + spin_unlock(&ichan->lock); + callback(callback_param); + spin_lock(&ichan->lock); } /* Flip the active buffer - even if update above failed */ @@ -1329,13 +1412,20 @@ static void ipu_gc_tasklet(unsigned long arg) struct idmac_channel *ichan = ipu->channel + i; struct idmac_tx_desc *desc; unsigned long flags; - int j; + struct scatterlist *sg; + int j, k; for (j = 0; j < ichan->n_tx_desc; j++) { desc = ichan->desc + j; spin_lock_irqsave(&ichan->lock, flags); if (async_tx_test_ack(&desc->txd)) { list_move(&desc->list, &ichan->free_list); + for_each_sg(desc->sg, sg, desc->sg_len, k) { + if (ichan->sg[0] == sg) + ichan->sg[0] = NULL; + else if (ichan->sg[1] == sg) + ichan->sg[1] = NULL; + } async_tx_clear_ack(&desc->txd); } spin_unlock_irqrestore(&ichan->lock, flags); @@ -1471,15 +1561,22 @@ static int idmac_alloc_chan_resources(struct dma_chan *chan) goto eimap; ichan->eof_irq = ret; - ret = request_irq(ichan->eof_irq, idmac_interrupt, 0, - ichan->eof_name, ichan); - if (ret < 0) - goto erirq; + + /* + * Important to first disable the channel, because maybe someone + * used it before us, e.g., the bootloader + */ + ipu_disable_channel(idmac, ichan, true); ret = ipu_init_channel(idmac, ichan); if (ret < 0) goto eichan; + ret = request_irq(ichan->eof_irq, idmac_interrupt, 0, + ichan->eof_name, ichan); + if (ret < 0) + goto erirq; + ichan->status = IPU_CHANNEL_INITIALIZED; dev_dbg(&ichan->dma_chan.dev->device, "Found channel 0x%x, irq %d\n", @@ -1487,9 +1584,9 @@ static int idmac_alloc_chan_resources(struct dma_chan *chan) return ret; -eichan: - free_irq(ichan->eof_irq, ichan); erirq: + ipu_uninit_channel(idmac, ichan); +eichan: ipu_irq_unmap(ichan->dma_chan.chan_id); eimap: return ret; -- cgit v1.2.3-59-g8ed1b From ccccce229c633a92c42cd1a40c0738d7b0d12644 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:24 -0700 Subject: dmaengine: initialize tx_list in dma_async_tx_descriptor_init Centralize this common initialization (and one case where ipu_idmac is duplicating ->chan initialization). Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 1 + drivers/dma/dw_dmac.c | 1 - drivers/dma/fsldma.c | 1 - drivers/dma/ioat_dma.c | 1 - drivers/dma/iop-adma.c | 1 - drivers/dma/ipu/ipu_idmac.c | 2 -- drivers/dma/mv_xor.c | 1 - 7 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 49243d14b894..a41d1ea10fa3 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -920,6 +920,7 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, { tx->chan = chan; spin_lock_init(&tx->lock); + INIT_LIST_HEAD(&tx->tx_list); } EXPORT_SYMBOL(dma_async_tx_descriptor_init); diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index a97c07eef7ec..862fc9ce9d86 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -826,7 +826,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) dma_async_tx_descriptor_init(&desc->txd, chan); desc->txd.tx_submit = dwc_tx_submit; desc->txd.flags = DMA_CTRL_ACK; - INIT_LIST_HEAD(&desc->txd.tx_list); desc->txd.phys = dma_map_single(chan2parent(chan), &desc->lli, sizeof(desc->lli), DMA_TO_DEVICE); dwc_desc_put(dwc, desc); diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 86d6da47f558..da8a8ed9e411 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -354,7 +354,6 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor( dma_async_tx_descriptor_init(&desc_sw->async_tx, &fsl_chan->common); desc_sw->async_tx.tx_submit = fsl_dma_tx_submit; - INIT_LIST_HEAD(&desc_sw->async_tx.tx_list); desc_sw->async_tx.phys = pdesc; } diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index 5905cd36bcd2..e4fc33c1c32f 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -693,7 +693,6 @@ static struct ioat_desc_sw *ioat_dma_alloc_descriptor( desc_sw->async_tx.tx_submit = ioat2_tx_submit; break; } - INIT_LIST_HEAD(&desc_sw->async_tx.tx_list); desc_sw->hw = desc; desc_sw->async_tx.phys = phys; diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index 16adbe61cfb2..2f052265122f 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -498,7 +498,6 @@ static int iop_adma_alloc_chan_resources(struct dma_chan *chan) slot->async_tx.tx_submit = iop_adma_tx_submit; INIT_LIST_HEAD(&slot->chain_node); INIT_LIST_HEAD(&slot->slot_node); - INIT_LIST_HEAD(&slot->async_tx.tx_list); hw_desc = (char *) iop_chan->device->dma_desc_pool; slot->async_tx.phys = (dma_addr_t) &hw_desc[idx * IOP_ADMA_SLOT_SIZE]; diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c index 0a525c450f24..7ce0e25266dc 100644 --- a/drivers/dma/ipu/ipu_idmac.c +++ b/drivers/dma/ipu/ipu_idmac.c @@ -983,8 +983,6 @@ static int idmac_desc_alloc(struct idmac_channel *ichan, int n) memset(txd, 0, sizeof(*txd)); dma_async_tx_descriptor_init(txd, &ichan->dma_chan); txd->tx_submit = idmac_tx_submit; - txd->chan = &ichan->dma_chan; - INIT_LIST_HEAD(&txd->tx_list); list_add(&desc->list, &ichan->free_list); diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index cb7f26fb9f18..ddab94f51224 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -632,7 +632,6 @@ static int mv_xor_alloc_chan_resources(struct dma_chan *chan) slot->async_tx.tx_submit = mv_xor_tx_submit; INIT_LIST_HEAD(&slot->chain_node); INIT_LIST_HEAD(&slot->slot_node); - INIT_LIST_HEAD(&slot->async_tx.tx_list); hw_desc = (char *) mv_chan->device->dma_desc_pool; slot->async_tx.phys = (dma_addr_t) &hw_desc[idx * MV_XOR_SLOT_SIZE]; -- cgit v1.2.3-59-g8ed1b From 54aee6a5f560d0e1bf3f39987c6ebe06daeb0ce1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:24 -0700 Subject: dmaengine: kill some unused headers The dmaengine redux left some unneeded headers in include/linux/dmaengine.h, clean them up. Signed-off-by: Dan Williams --- include/linux/dmaengine.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 1956c8d46d32..96e676e5bf9b 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -23,9 +23,6 @@ #include #include -#include -#include -#include #include /** -- cgit v1.2.3-59-g8ed1b From 06164f3194e01ea4c76941ac60f541d656c8975f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:25 -0700 Subject: async_tx: provide __async_inline for HAS_DMA=n archs To allow an async_tx routine to be compiled away on HAS_DMA=n arch it needs to be declared __always_inline otherwise the compiler may emit code and cause a link error. Signed-off-by: Dan Williams --- crypto/async_tx/async_xor.c | 7 ++----- include/linux/async_tx.h | 9 +++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/crypto/async_tx/async_xor.c b/crypto/async_tx/async_xor.c index 595b78672b36..95fe2c8d6c51 100644 --- a/crypto/async_tx/async_xor.c +++ b/crypto/async_tx/async_xor.c @@ -30,11 +30,8 @@ #include #include -/* do_async_xor - dma map the pages and perform the xor with an engine. - * This routine is marked __always_inline so it can be compiled away - * when CONFIG_DMA_ENGINE=n - */ -static __always_inline struct dma_async_tx_descriptor * +/* do_async_xor - dma map the pages and perform the xor with an engine */ +static __async_inline struct dma_async_tx_descriptor * do_async_xor(struct dma_chan *chan, struct page *dest, struct page **src_list, unsigned int offset, int src_cnt, size_t len, enum async_tx_flags flags, diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 45f6297821bd..5fc2ef8d97fa 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -21,6 +21,15 @@ #include #include +/* on architectures without dma-mapping capabilities we need to ensure + * that the asynchronous path compiles away + */ +#ifdef CONFIG_HAS_DMA +#define __async_inline +#else +#define __async_inline __always_inline +#endif + /** * dma_chan_ref - object used to manage dma channels received from the * dmaengine core. -- cgit v1.2.3-59-g8ed1b From 729b5d1b8ec72c28e99840b3f300ba67726e3ab9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:25 -0700 Subject: dmaengine: allow dma support for async_tx to be toggled Provide a config option for blocking the allocation of dma channels to the async_tx api. Signed-off-by: Dan Williams --- crypto/async_tx/async_tx.c | 6 +++--- drivers/dma/Kconfig | 11 +++++++++++ include/linux/dmaengine.h | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index f21147f3626a..06eb6cc09fef 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -30,7 +30,7 @@ #ifdef CONFIG_DMA_ENGINE static int __init async_tx_init(void) { - dmaengine_get(); + async_dmaengine_get(); printk(KERN_INFO "async_tx: api initialized (async)\n"); @@ -39,7 +39,7 @@ static int __init async_tx_init(void) static void __exit async_tx_exit(void) { - dmaengine_put(); + async_dmaengine_put(); } /** @@ -56,7 +56,7 @@ __async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, if (depend_tx && dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) return depend_tx->chan; - return dma_find_channel(tx_type); + return async_dma_find_channel(tx_type); } EXPORT_SYMBOL_GPL(__async_tx_find_channel); #else diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 48ea59e79672..3b3c01b6f1ee 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -98,6 +98,17 @@ config NET_DMA Say Y here if you enabled INTEL_IOATDMA or FSL_DMA, otherwise say N. +config ASYNC_TX_DMA + bool "Async_tx: Offload support for the async_tx api" + depends on DMA_ENGINE + help + This allows the async_tx api to take advantage of offload engines for + memcpy, memset, xor, and raid6 p+q operations. If your platform has + a dma engine that can perform raid operations and you have enabled + MD_RAID456 say Y. + + If unsure, say N. + config DMATEST tristate "DMA Test client" depends on DMA_ENGINE diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 96e676e5bf9b..2afc2c95e42d 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -288,6 +288,24 @@ static inline void net_dmaengine_put(void) } #endif +#ifdef CONFIG_ASYNC_TX_DMA +#define async_dmaengine_get() dmaengine_get() +#define async_dmaengine_put() dmaengine_put() +#define async_dma_find_channel(type) dma_find_channel(type) +#else +static inline void async_dmaengine_get(void) +{ +} +static inline void async_dmaengine_put(void) +{ +} +static inline struct dma_chan * +async_dma_find_channel(enum dma_transaction_type type) +{ + return NULL; +} +#endif + dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, -- cgit v1.2.3-59-g8ed1b From b54d5cb9156868fb4f27ccd46a3afb0bf3ef8e0c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:25 -0700 Subject: dmatest: add xor test Extend dmatest to launch a thread per supported operation type and add an xor test. Signed-off-by: Dan Williams --- drivers/dma/dmatest.c | 274 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 189 insertions(+), 85 deletions(-) diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index e190d8b30700..224acf478fec 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -38,6 +38,11 @@ module_param(max_channels, uint, S_IRUGO); MODULE_PARM_DESC(max_channels, "Maximum number of channels to use (default: all)"); +static unsigned int xor_sources = 3; +module_param(xor_sources, uint, S_IRUGO); +MODULE_PARM_DESC(xor_sources, + "Number of xor source buffers (default: 3)"); + /* * Initialization patterns. All bytes in the source buffer has bit 7 * set, all bytes in the destination buffer has bit 7 cleared. @@ -59,8 +64,9 @@ struct dmatest_thread { struct list_head node; struct task_struct *task; struct dma_chan *chan; - u8 *srcbuf; - u8 *dstbuf; + u8 **srcs; + u8 **dsts; + enum dma_transaction_type type; }; struct dmatest_chan { @@ -98,30 +104,37 @@ static unsigned long dmatest_random(void) return buf; } -static void dmatest_init_srcbuf(u8 *buf, unsigned int start, unsigned int len) +static void dmatest_init_srcs(u8 **bufs, unsigned int start, unsigned int len) { unsigned int i; - - for (i = 0; i < start; i++) - buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); - for ( ; i < start + len; i++) - buf[i] = PATTERN_SRC | PATTERN_COPY - | (~i & PATTERN_COUNT_MASK);; - for ( ; i < test_buf_size; i++) - buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); + u8 *buf; + + for (; (buf = *bufs); bufs++) { + for (i = 0; i < start; i++) + buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); + for ( ; i < start + len; i++) + buf[i] = PATTERN_SRC | PATTERN_COPY + | (~i & PATTERN_COUNT_MASK);; + for ( ; i < test_buf_size; i++) + buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); + buf++; + } } -static void dmatest_init_dstbuf(u8 *buf, unsigned int start, unsigned int len) +static void dmatest_init_dsts(u8 **bufs, unsigned int start, unsigned int len) { unsigned int i; - - for (i = 0; i < start; i++) - buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK); - for ( ; i < start + len; i++) - buf[i] = PATTERN_DST | PATTERN_OVERWRITE - | (~i & PATTERN_COUNT_MASK); - for ( ; i < test_buf_size; i++) - buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK); + u8 *buf; + + for (; (buf = *bufs); bufs++) { + for (i = 0; i < start; i++) + buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK); + for ( ; i < start + len; i++) + buf[i] = PATTERN_DST | PATTERN_OVERWRITE + | (~i & PATTERN_COUNT_MASK); + for ( ; i < test_buf_size; i++) + buf[i] = PATTERN_DST | (~i & PATTERN_COUNT_MASK); + } } static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index, @@ -150,23 +163,30 @@ static void dmatest_mismatch(u8 actual, u8 pattern, unsigned int index, thread_name, index, expected, actual); } -static unsigned int dmatest_verify(u8 *buf, unsigned int start, +static unsigned int dmatest_verify(u8 **bufs, unsigned int start, unsigned int end, unsigned int counter, u8 pattern, bool is_srcbuf) { unsigned int i; unsigned int error_count = 0; u8 actual; - - for (i = start; i < end; i++) { - actual = buf[i]; - if (actual != (pattern | (~counter & PATTERN_COUNT_MASK))) { - if (error_count < 32) - dmatest_mismatch(actual, pattern, i, counter, - is_srcbuf); - error_count++; + u8 expected; + u8 *buf; + unsigned int counter_orig = counter; + + for (; (buf = *bufs); bufs++) { + counter = counter_orig; + for (i = start; i < end; i++) { + actual = buf[i]; + expected = pattern | (~counter & PATTERN_COUNT_MASK); + if (actual != expected) { + if (error_count < 32) + dmatest_mismatch(actual, pattern, i, + counter, is_srcbuf); + error_count++; + } + counter++; } - counter++; } if (error_count > 32) @@ -178,10 +198,10 @@ static unsigned int dmatest_verify(u8 *buf, unsigned int start, /* * This function repeatedly tests DMA transfers of various lengths and - * offsets until it is told to exit by kthread_stop(). There may be - * multiple threads running this function in parallel for a single - * channel, and there may be multiple channels being tested in - * parallel. + * offsets for a given operation type until it is told to exit by + * kthread_stop(). There may be multiple threads running this function + * in parallel for a single channel, and there may be multiple channels + * being tested in parallel. * * Before each test, the source and destination buffer is initialized * with a known pattern. This pattern is different depending on @@ -201,25 +221,53 @@ static int dmatest_func(void *data) unsigned int total_tests = 0; dma_cookie_t cookie; enum dma_status status; + enum dma_ctrl_flags flags; int ret; + int src_cnt; + int dst_cnt; + int i; thread_name = current->comm; ret = -ENOMEM; - thread->srcbuf = kmalloc(test_buf_size, GFP_KERNEL); - if (!thread->srcbuf) - goto err_srcbuf; - thread->dstbuf = kmalloc(test_buf_size, GFP_KERNEL); - if (!thread->dstbuf) - goto err_dstbuf; smp_rmb(); chan = thread->chan; + if (thread->type == DMA_MEMCPY) + src_cnt = dst_cnt = 1; + else if (thread->type == DMA_XOR) { + src_cnt = xor_sources | 1; /* force odd to ensure dst = src */ + dst_cnt = 1; + } else + goto err_srcs; + + thread->srcs = kcalloc(src_cnt+1, sizeof(u8 *), GFP_KERNEL); + if (!thread->srcs) + goto err_srcs; + for (i = 0; i < src_cnt; i++) { + thread->srcs[i] = kmalloc(test_buf_size, GFP_KERNEL); + if (!thread->srcs[i]) + goto err_srcbuf; + } + thread->srcs[i] = NULL; + + thread->dsts = kcalloc(dst_cnt+1, sizeof(u8 *), GFP_KERNEL); + if (!thread->dsts) + goto err_dsts; + for (i = 0; i < dst_cnt; i++) { + thread->dsts[i] = kmalloc(test_buf_size, GFP_KERNEL); + if (!thread->dsts[i]) + goto err_dstbuf; + } + thread->dsts[i] = NULL; + + flags = DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP; while (!kthread_should_stop()) { struct dma_device *dev = chan->device; - struct dma_async_tx_descriptor *tx; - dma_addr_t dma_src, dma_dest; + struct dma_async_tx_descriptor *tx = NULL; + dma_addr_t dma_srcs[src_cnt]; + dma_addr_t dma_dsts[dst_cnt]; total_tests++; @@ -227,22 +275,41 @@ static int dmatest_func(void *data) src_off = dmatest_random() % (test_buf_size - len + 1); dst_off = dmatest_random() % (test_buf_size - len + 1); - dmatest_init_srcbuf(thread->srcbuf, src_off, len); - dmatest_init_dstbuf(thread->dstbuf, dst_off, len); + dmatest_init_srcs(thread->srcs, src_off, len); + dmatest_init_dsts(thread->dsts, dst_off, len); + + for (i = 0; i < src_cnt; i++) { + u8 *buf = thread->srcs[i] + src_off; - dma_src = dma_map_single(dev->dev, thread->srcbuf + src_off, - len, DMA_TO_DEVICE); + dma_srcs[i] = dma_map_single(dev->dev, buf, len, + DMA_TO_DEVICE); + } /* map with DMA_BIDIRECTIONAL to force writeback/invalidate */ - dma_dest = dma_map_single(dev->dev, thread->dstbuf, - test_buf_size, DMA_BIDIRECTIONAL); + for (i = 0; i < dst_cnt; i++) { + dma_dsts[i] = dma_map_single(dev->dev, thread->dsts[i], + test_buf_size, + DMA_BIDIRECTIONAL); + } + + if (thread->type == DMA_MEMCPY) + tx = dev->device_prep_dma_memcpy(chan, + dma_dsts[0] + dst_off, + dma_srcs[0], len, + flags); + else if (thread->type == DMA_XOR) + tx = dev->device_prep_dma_xor(chan, + dma_dsts[0] + dst_off, + dma_srcs, xor_sources, + len, flags); - tx = dev->device_prep_dma_memcpy(chan, dma_dest + dst_off, - dma_src, len, - DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP); if (!tx) { - dma_unmap_single(dev->dev, dma_src, len, DMA_TO_DEVICE); - dma_unmap_single(dev->dev, dma_dest, - test_buf_size, DMA_BIDIRECTIONAL); + for (i = 0; i < src_cnt; i++) + dma_unmap_single(dev->dev, dma_srcs[i], len, + DMA_TO_DEVICE); + for (i = 0; i < dst_cnt; i++) + dma_unmap_single(dev->dev, dma_dsts[i], + test_buf_size, + DMA_BIDIRECTIONAL); pr_warning("%s: #%u: prep error with src_off=0x%x " "dst_off=0x%x len=0x%x\n", thread_name, total_tests - 1, @@ -263,11 +330,11 @@ static int dmatest_func(void *data) failed_tests++; continue; } - dma_async_memcpy_issue_pending(chan); + dma_async_issue_pending(chan); do { msleep(1); - status = dma_async_memcpy_complete( + status = dma_async_is_tx_complete( chan, cookie, NULL, NULL); } while (status == DMA_IN_PROGRESS); @@ -278,29 +345,30 @@ static int dmatest_func(void *data) continue; } /* Unmap by myself (see DMA_COMPL_SKIP_DEST_UNMAP above) */ - dma_unmap_single(dev->dev, dma_dest, - test_buf_size, DMA_BIDIRECTIONAL); + for (i = 0; i < dst_cnt; i++) + dma_unmap_single(dev->dev, dma_dsts[i], test_buf_size, + DMA_BIDIRECTIONAL); error_count = 0; pr_debug("%s: verifying source buffer...\n", thread_name); - error_count += dmatest_verify(thread->srcbuf, 0, src_off, + error_count += dmatest_verify(thread->srcs, 0, src_off, 0, PATTERN_SRC, true); - error_count += dmatest_verify(thread->srcbuf, src_off, + error_count += dmatest_verify(thread->srcs, src_off, src_off + len, src_off, PATTERN_SRC | PATTERN_COPY, true); - error_count += dmatest_verify(thread->srcbuf, src_off + len, + error_count += dmatest_verify(thread->srcs, src_off + len, test_buf_size, src_off + len, PATTERN_SRC, true); pr_debug("%s: verifying dest buffer...\n", thread->task->comm); - error_count += dmatest_verify(thread->dstbuf, 0, dst_off, + error_count += dmatest_verify(thread->dsts, 0, dst_off, 0, PATTERN_DST, false); - error_count += dmatest_verify(thread->dstbuf, dst_off, + error_count += dmatest_verify(thread->dsts, dst_off, dst_off + len, src_off, PATTERN_SRC | PATTERN_COPY, false); - error_count += dmatest_verify(thread->dstbuf, dst_off + len, + error_count += dmatest_verify(thread->dsts, dst_off + len, test_buf_size, dst_off + len, PATTERN_DST, false); @@ -319,10 +387,16 @@ static int dmatest_func(void *data) } ret = 0; - kfree(thread->dstbuf); + for (i = 0; thread->dsts[i]; i++) + kfree(thread->dsts[i]); err_dstbuf: - kfree(thread->srcbuf); + kfree(thread->dsts); +err_dsts: + for (i = 0; thread->srcs[i]; i++) + kfree(thread->srcs[i]); err_srcbuf: + kfree(thread->srcs); +err_srcs: pr_notice("%s: terminating after %u tests, %u failures (status %d)\n", thread_name, total_tests, failed_tests, ret); return ret; @@ -344,35 +418,36 @@ static void dmatest_cleanup_channel(struct dmatest_chan *dtc) kfree(dtc); } -static int dmatest_add_channel(struct dma_chan *chan) +static int dmatest_add_threads(struct dmatest_chan *dtc, enum dma_transaction_type type) { - struct dmatest_chan *dtc; - struct dmatest_thread *thread; - unsigned int i; - - dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL); - if (!dtc) { - pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan)); - return -ENOMEM; - } + struct dmatest_thread *thread; + struct dma_chan *chan = dtc->chan; + char *op; + unsigned int i; - dtc->chan = chan; - INIT_LIST_HEAD(&dtc->threads); + if (type == DMA_MEMCPY) + op = "copy"; + else if (type == DMA_XOR) + op = "xor"; + else + return -EINVAL; for (i = 0; i < threads_per_chan; i++) { thread = kzalloc(sizeof(struct dmatest_thread), GFP_KERNEL); if (!thread) { - pr_warning("dmatest: No memory for %s-test%u\n", - dma_chan_name(chan), i); + pr_warning("dmatest: No memory for %s-%s%u\n", + dma_chan_name(chan), op, i); + break; } thread->chan = dtc->chan; + thread->type = type; smp_wmb(); - thread->task = kthread_run(dmatest_func, thread, "%s-test%u", - dma_chan_name(chan), i); + thread->task = kthread_run(dmatest_func, thread, "%s-%s%u", + dma_chan_name(chan), op, i); if (IS_ERR(thread->task)) { - pr_warning("dmatest: Failed to run thread %s-test%u\n", - dma_chan_name(chan), i); + pr_warning("dmatest: Failed to run thread %s-%s%u\n", + dma_chan_name(chan), op, i); kfree(thread); break; } @@ -382,7 +457,36 @@ static int dmatest_add_channel(struct dma_chan *chan) list_add_tail(&thread->node, &dtc->threads); } - pr_info("dmatest: Started %u threads using %s\n", i, dma_chan_name(chan)); + return i; +} + +static int dmatest_add_channel(struct dma_chan *chan) +{ + struct dmatest_chan *dtc; + struct dma_device *dma_dev = chan->device; + unsigned int thread_count = 0; + unsigned int cnt; + + dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL); + if (!dtc) { + pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan)); + return -ENOMEM; + } + + dtc->chan = chan; + INIT_LIST_HEAD(&dtc->threads); + + if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) { + cnt = dmatest_add_threads(dtc, DMA_MEMCPY); + thread_count += cnt > 0 ?: 0; + } + if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { + cnt = dmatest_add_threads(dtc, DMA_XOR); + thread_count += cnt > 0 ?: 0; + } + + pr_info("dmatest: Started %u threads using %s\n", + thread_count, dma_chan_name(chan)); list_add_tail(&dtc->node, &dmatest_channels); nr_channels++; -- cgit v1.2.3-59-g8ed1b From e44e0aa3cfa97cddff01704751a4b25151830c72 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 25 Mar 2009 09:13:25 -0700 Subject: dmatest: add dma interrupts and callbacks Use the callback infrastructure to report driver/hardware hangs or missed interrupts. Since this makes the test threads much more aggressive (from: explicit 1ms sleep to: wait_for_completion) we set the nice value to 10 so as to not swamp legitimate tasks. Signed-off-by: Dan Williams --- drivers/dma/dmatest.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 224acf478fec..a27c0fb1bc11 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -196,6 +196,11 @@ static unsigned int dmatest_verify(u8 **bufs, unsigned int start, return error_count; } +static void dmatest_callback(void *completion) +{ + complete(completion); +} + /* * This function repeatedly tests DMA transfers of various lengths and * offsets for a given operation type until it is told to exit by @@ -261,13 +266,17 @@ static int dmatest_func(void *data) } thread->dsts[i] = NULL; - flags = DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP; + set_user_nice(current, 10); + + flags = DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP | DMA_PREP_INTERRUPT; while (!kthread_should_stop()) { struct dma_device *dev = chan->device; struct dma_async_tx_descriptor *tx = NULL; dma_addr_t dma_srcs[src_cnt]; dma_addr_t dma_dsts[dst_cnt]; + struct completion cmp; + unsigned long tmo = msecs_to_jiffies(3000); total_tests++; @@ -318,7 +327,10 @@ static int dmatest_func(void *data) failed_tests++; continue; } - tx->callback = NULL; + + init_completion(&cmp); + tx->callback = dmatest_callback; + tx->callback_param = &cmp; cookie = tx->tx_submit(tx); if (dma_submit_error(cookie)) { @@ -332,18 +344,23 @@ static int dmatest_func(void *data) } dma_async_issue_pending(chan); - do { - msleep(1); - status = dma_async_is_tx_complete( - chan, cookie, NULL, NULL); - } while (status == DMA_IN_PROGRESS); + tmo = wait_for_completion_timeout(&cmp, tmo); + status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); - if (status == DMA_ERROR) { - pr_warning("%s: #%u: error during copy\n", - thread_name, total_tests - 1); + if (tmo == 0) { + pr_warning("%s: #%u: test timed out\n", + thread_name, total_tests - 1); + failed_tests++; + continue; + } else if (status != DMA_SUCCESS) { + pr_warning("%s: #%u: got completion callback," + " but status is \'%s\'\n", + thread_name, total_tests - 1, + status == DMA_ERROR ? "error" : "in progress"); failed_tests++; continue; } + /* Unmap by myself (see DMA_COMPL_SKIP_DEST_UNMAP above) */ for (i = 0; i < dst_cnt; i++) dma_unmap_single(dev->dev, dma_dsts[i], test_buf_size, -- cgit v1.2.3-59-g8ed1b From de18836e447c2dc30120c0919b8db8ddc0401cc4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2009 17:33:38 +0100 Subject: genirq: fix devres.o build for GENERIC_HARDIRQS=n kernel/irq/devres.c is built by sparc (32bit) and m68k via the obscure ../../../kernel/irq/devres.o reference in arch/[sparc/m68k]/kernel/Makefile To avoid ifdeffery in devres.c provide request_threaded_irq as an inline for these users. Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 7e63b824833f..143192f48bf3 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -123,6 +123,20 @@ extern int __must_check request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev); +/* + * Special function to avoid ifdeffery in kernel/irq/devres.c which + * gets magically built by GENERIC_HARDIRQS=n architectures (sparc, + * m68k). I really love these $@%#!* obvious Makefile references: + * ../../../kernel/irq/devres.o + */ +static inline int __must_check +request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, + unsigned long flags, const char *name, void *dev) +{ + return request_irq(irq, handler, flags, name, dev); +} + static inline void exit_irq_thread(void) { } #endif -- cgit v1.2.3-59-g8ed1b From 0361a28d3f9a4315a100c7b37ba0b55cfe15fe07 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 17 Dec 2008 15:38:03 +0100 Subject: HID: autosuspend support for USB HID This uses the USB busy mechanism for aggessive autosuspend of USB HID devices. It autosuspends all opened devices supporting remote wakeup after a timeout unless - output is being done to the device - a key is being held down (remote wakeup isn't triggered upon key release) - LED(s) are lit - hiddev is opened As in the current driver closed devices will be autosuspended even if they don't support remote wakeup. The patch is quite large because output to devices is done in hard interrupt context meaning a lot a queuing and locking had to be touched. The LED stuff has been solved by means of a simple counter. Additions to the generic HID code could be avoided. In addition it now covers hidraw. It contains an embryonic version of an API to let the generic HID code tell the lower levels which capabilities with respect to power management are needed. Signed-off-by: Oliver Neukum Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 16 ++ drivers/hid/hidraw.c | 17 +- drivers/hid/usbhid/hid-core.c | 435 +++++++++++++++++++++++++++++++++--------- drivers/hid/usbhid/hiddev.c | 17 +- drivers/hid/usbhid/usbhid.h | 14 +- include/linux/hid.h | 6 + 6 files changed, 402 insertions(+), 103 deletions(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 1cc967448f4d..feaeb6167ea4 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1822,6 +1822,22 @@ static DECLARE_WORK(hid_compat_work, hid_compat_load); static struct workqueue_struct *hid_compat_wq; #endif +int hid_check_keys_pressed(struct hid_device *hid) +{ + struct hid_input *hidinput; + int i; + + list_for_each_entry(hidinput, &hid->inputs, list) { + for (i = 0; i < BITS_TO_LONGS(KEY_MAX); i++) + if (hidinput->input->key[i]) + return 1; + } + + return 0; +} + +EXPORT_SYMBOL_GPL(hid_check_keys_pressed); + static int __init hid_init(void) { int ret; diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index 02b19db5442e..e263d4731179 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -181,9 +181,17 @@ static int hidraw_open(struct inode *inode, struct file *file) dev = hidraw_table[minor]; if (!dev->open++) { + if (dev->hid->ll_driver->power) { + err = dev->hid->ll_driver->power(dev->hid, PM_HINT_FULLON); + if (err < 0) + goto out_unlock; + } err = dev->hid->ll_driver->open(dev->hid); - if (err < 0) + if (err < 0) { + if (dev->hid->ll_driver->power) + dev->hid->ll_driver->power(dev->hid, PM_HINT_NORMAL); dev->open--; + } } out_unlock: @@ -209,10 +217,13 @@ static int hidraw_release(struct inode * inode, struct file * file) list_del(&list->node); dev = hidraw_table[minor]; if (!--dev->open) { - if (list->hidraw->exist) + if (list->hidraw->exist) { + if (dev->hid->ll_driver->power) + dev->hid->ll_driver->power(dev->hid, PM_HINT_NORMAL); dev->hid->ll_driver->close(dev->hid); - else + } else { kfree(list->hidraw); + } } kfree(list); diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index f0a0f72238ab..625e7e8eb373 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -5,6 +5,7 @@ * Copyright (c) 2000-2005 Vojtech Pavlik * Copyright (c) 2005 Michael Haboustak for Concept2, Inc * Copyright (c) 2006-2008 Jiri Kosina + * Copyright (c) 2007-2008 Oliver Neukum */ /* @@ -27,6 +28,7 @@ #include #include #include +#include #include @@ -53,6 +55,10 @@ static unsigned int hid_mousepoll_interval; module_param_named(mousepoll, hid_mousepoll_interval, uint, 0644); MODULE_PARM_DESC(mousepoll, "Polling interval of mice"); +static unsigned int ignoreled; +module_param_named(ignoreled, ignoreled, uint, 0644); +MODULE_PARM_DESC(ignoreled, "Autosuspend with active leds"); + /* Quirks specified at module load time */ static char *quirks_param[MAX_USBHID_BOOT_QUIRKS] = { [ 0 ... (MAX_USBHID_BOOT_QUIRKS - 1) ] = NULL }; module_param_array_named(quirks, quirks_param, charp, NULL, 0444); @@ -63,8 +69,13 @@ MODULE_PARM_DESC(quirks, "Add/modify USB HID quirks by specifying " /* * Input submission and I/O error handler. */ +static DEFINE_MUTEX(hid_open_mut); +static struct workqueue_struct *resumption_waker; static void hid_io_error(struct hid_device *hid); +static int hid_submit_out(struct hid_device *hid); +static int hid_submit_ctrl(struct hid_device *hid); +static void hid_cancel_delayed_stuff(struct usbhid_device *usbhid); /* Start up the input URB */ static int hid_start_in(struct hid_device *hid) @@ -73,15 +84,16 @@ static int hid_start_in(struct hid_device *hid) int rc = 0; struct usbhid_device *usbhid = hid->driver_data; - spin_lock_irqsave(&usbhid->inlock, flags); - if (hid->open > 0 && !test_bit(HID_SUSPENDED, &usbhid->iofl) && + spin_lock_irqsave(&usbhid->lock, flags); + if (hid->open > 0 && !test_bit(HID_DISCONNECTED, &usbhid->iofl) && + !test_bit(HID_REPORTED_IDLE, &usbhid->iofl) && !test_and_set_bit(HID_IN_RUNNING, &usbhid->iofl)) { rc = usb_submit_urb(usbhid->urbin, GFP_ATOMIC); if (rc != 0) clear_bit(HID_IN_RUNNING, &usbhid->iofl); } - spin_unlock_irqrestore(&usbhid->inlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); return rc; } @@ -145,7 +157,7 @@ static void hid_io_error(struct hid_device *hid) unsigned long flags; struct usbhid_device *usbhid = hid->driver_data; - spin_lock_irqsave(&usbhid->inlock, flags); + spin_lock_irqsave(&usbhid->lock, flags); /* Stop when disconnected */ if (test_bit(HID_DISCONNECTED, &usbhid->iofl)) @@ -175,7 +187,51 @@ static void hid_io_error(struct hid_device *hid) mod_timer(&usbhid->io_retry, jiffies + msecs_to_jiffies(usbhid->retry_delay)); done: - spin_unlock_irqrestore(&usbhid->inlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); +} + +static void usbhid_mark_busy(struct usbhid_device *usbhid) +{ + struct usb_interface *intf = usbhid->intf; + + usb_mark_last_busy(interface_to_usbdev(intf)); +} + +static int usbhid_restart_out_queue(struct usbhid_device *usbhid) +{ + struct hid_device *hid = usb_get_intfdata(usbhid->intf); + int kicked; + + if (!hid) + return 0; + + if ((kicked = (usbhid->outhead != usbhid->outtail))) { + dbg("Kicking head %d tail %d", usbhid->outhead, usbhid->outtail); + if (hid_submit_out(hid)) { + clear_bit(HID_OUT_RUNNING, &usbhid->iofl); + wake_up(&usbhid->wait); + } + } + return kicked; +} + +static int usbhid_restart_ctrl_queue(struct usbhid_device *usbhid) +{ + struct hid_device *hid = usb_get_intfdata(usbhid->intf); + int kicked; + + WARN_ON(hid == NULL); + if (!hid) + return 0; + + if ((kicked = (usbhid->ctrlhead != usbhid->ctrltail))) { + dbg("Kicking head %d tail %d", usbhid->ctrlhead, usbhid->ctrltail); + if (hid_submit_ctrl(hid)) { + clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); + wake_up(&usbhid->wait); + } + } + return kicked; } /* @@ -190,12 +246,23 @@ static void hid_irq_in(struct urb *urb) switch (urb->status) { case 0: /* success */ + usbhid_mark_busy(usbhid); usbhid->retry_delay = 0; hid_input_report(urb->context, HID_INPUT_REPORT, urb->transfer_buffer, urb->actual_length, 1); + /* + * autosuspend refused while keys are pressed + * because most keyboards don't wake up when + * a key is released + */ + if (hid_check_keys_pressed(hid)) + set_bit(HID_KEYS_PRESSED, &usbhid->iofl); + else + clear_bit(HID_KEYS_PRESSED, &usbhid->iofl); break; case -EPIPE: /* stall */ + usbhid_mark_busy(usbhid); clear_bit(HID_IN_RUNNING, &usbhid->iofl); set_bit(HID_CLEAR_HALT, &usbhid->iofl); schedule_work(&usbhid->reset_work); @@ -209,6 +276,7 @@ static void hid_irq_in(struct urb *urb) case -EPROTO: /* protocol error or unplug */ case -ETIME: /* protocol error or unplug */ case -ETIMEDOUT: /* Should never happen, but... */ + usbhid_mark_busy(usbhid); clear_bit(HID_IN_RUNNING, &usbhid->iofl); hid_io_error(hid); return; @@ -239,16 +307,25 @@ static int hid_submit_out(struct hid_device *hid) report = usbhid->out[usbhid->outtail].report; raw_report = usbhid->out[usbhid->outtail].raw_report; - usbhid->urbout->transfer_buffer_length = ((report->size - 1) >> 3) + 1 + (report->id > 0); - usbhid->urbout->dev = hid_to_usb_dev(hid); - memcpy(usbhid->outbuf, raw_report, usbhid->urbout->transfer_buffer_length); - kfree(raw_report); + if (!test_bit(HID_REPORTED_IDLE, &usbhid->iofl)) { + usbhid->urbout->transfer_buffer_length = ((report->size - 1) >> 3) + 1 + (report->id > 0); + usbhid->urbout->dev = hid_to_usb_dev(hid); + memcpy(usbhid->outbuf, raw_report, usbhid->urbout->transfer_buffer_length); + kfree(raw_report); - dbg_hid("submitting out urb\n"); + dbg_hid("submitting out urb\n"); - if (usb_submit_urb(usbhid->urbout, GFP_ATOMIC)) { - err_hid("usb_submit_urb(out) failed"); - return -1; + if (usb_submit_urb(usbhid->urbout, GFP_ATOMIC)) { + err_hid("usb_submit_urb(out) failed"); + return -1; + } + } else { + /* + * queue work to wake up the device. + * as the work queue is freezeable, this is safe + * with respect to STD and STR + */ + queue_work(resumption_waker, &usbhid->restart_work); } return 0; @@ -266,41 +343,50 @@ static int hid_submit_ctrl(struct hid_device *hid) raw_report = usbhid->ctrl[usbhid->ctrltail].raw_report; dir = usbhid->ctrl[usbhid->ctrltail].dir; - len = ((report->size - 1) >> 3) + 1 + (report->id > 0); - if (dir == USB_DIR_OUT) { - usbhid->urbctrl->pipe = usb_sndctrlpipe(hid_to_usb_dev(hid), 0); - usbhid->urbctrl->transfer_buffer_length = len; - memcpy(usbhid->ctrlbuf, raw_report, len); - kfree(raw_report); - } else { - int maxpacket, padlen; - - usbhid->urbctrl->pipe = usb_rcvctrlpipe(hid_to_usb_dev(hid), 0); - maxpacket = usb_maxpacket(hid_to_usb_dev(hid), usbhid->urbctrl->pipe, 0); - if (maxpacket > 0) { - padlen = DIV_ROUND_UP(len, maxpacket); - padlen *= maxpacket; - if (padlen > usbhid->bufsize) - padlen = usbhid->bufsize; - } else - padlen = 0; - usbhid->urbctrl->transfer_buffer_length = padlen; - } - usbhid->urbctrl->dev = hid_to_usb_dev(hid); + if (!test_bit(HID_REPORTED_IDLE, &usbhid->iofl)) { + len = ((report->size - 1) >> 3) + 1 + (report->id > 0); + if (dir == USB_DIR_OUT) { + usbhid->urbctrl->pipe = usb_sndctrlpipe(hid_to_usb_dev(hid), 0); + usbhid->urbctrl->transfer_buffer_length = len; + memcpy(usbhid->ctrlbuf, raw_report, len); + kfree(raw_report); + } else { + int maxpacket, padlen; + + usbhid->urbctrl->pipe = usb_rcvctrlpipe(hid_to_usb_dev(hid), 0); + maxpacket = usb_maxpacket(hid_to_usb_dev(hid), usbhid->urbctrl->pipe, 0); + if (maxpacket > 0) { + padlen = DIV_ROUND_UP(len, maxpacket); + padlen *= maxpacket; + if (padlen > usbhid->bufsize) + padlen = usbhid->bufsize; + } else + padlen = 0; + usbhid->urbctrl->transfer_buffer_length = padlen; + } + usbhid->urbctrl->dev = hid_to_usb_dev(hid); - usbhid->cr->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE | dir; - usbhid->cr->bRequest = (dir == USB_DIR_OUT) ? HID_REQ_SET_REPORT : HID_REQ_GET_REPORT; - usbhid->cr->wValue = cpu_to_le16(((report->type + 1) << 8) | report->id); - usbhid->cr->wIndex = cpu_to_le16(usbhid->ifnum); - usbhid->cr->wLength = cpu_to_le16(len); + usbhid->cr->bRequestType = USB_TYPE_CLASS | USB_RECIP_INTERFACE | dir; + usbhid->cr->bRequest = (dir == USB_DIR_OUT) ? HID_REQ_SET_REPORT : HID_REQ_GET_REPORT; + usbhid->cr->wValue = cpu_to_le16(((report->type + 1) << 8) | report->id); + usbhid->cr->wIndex = cpu_to_le16(usbhid->ifnum); + usbhid->cr->wLength = cpu_to_le16(len); - dbg_hid("submitting ctrl urb: %s wValue=0x%04x wIndex=0x%04x wLength=%u\n", - usbhid->cr->bRequest == HID_REQ_SET_REPORT ? "Set_Report" : "Get_Report", - usbhid->cr->wValue, usbhid->cr->wIndex, usbhid->cr->wLength); + dbg_hid("submitting ctrl urb: %s wValue=0x%04x wIndex=0x%04x wLength=%u\n", + usbhid->cr->bRequest == HID_REQ_SET_REPORT ? "Set_Report" : "Get_Report", + usbhid->cr->wValue, usbhid->cr->wIndex, usbhid->cr->wLength); - if (usb_submit_urb(usbhid->urbctrl, GFP_ATOMIC)) { - err_hid("usb_submit_urb(ctrl) failed"); - return -1; + if (usb_submit_urb(usbhid->urbctrl, GFP_ATOMIC)) { + err_hid("usb_submit_urb(ctrl) failed"); + return -1; + } + } else { + /* + * queue work to wake up the device. + * as the work queue is freezeable, this is safe + * with respect to STD and STR + */ + queue_work(resumption_waker, &usbhid->restart_work); } return 0; @@ -332,7 +418,7 @@ static void hid_irq_out(struct urb *urb) "received\n", urb->status); } - spin_lock_irqsave(&usbhid->outlock, flags); + spin_lock_irqsave(&usbhid->lock, flags); if (unplug) usbhid->outtail = usbhid->outhead; @@ -344,12 +430,12 @@ static void hid_irq_out(struct urb *urb) clear_bit(HID_OUT_RUNNING, &usbhid->iofl); wake_up(&usbhid->wait); } - spin_unlock_irqrestore(&usbhid->outlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); return; } clear_bit(HID_OUT_RUNNING, &usbhid->iofl); - spin_unlock_irqrestore(&usbhid->outlock, flags); + spin_unlock_irqrestore(&usbhid->lock, flags); wake_up(&usbhid->wait); } @@ -361,12 +447,11 @@ static void hid_ctrl(struct urb *urb) { struct hid_device *hid = urb->context; struct usbhid_device *usbhid = hid->driver_data; - unsigned long flags; - int unplug = 0; + int unplug = 0, status = urb->status; - spin_lock_irqsave(&usbhid->ctrllock, flags); + spin_lock(&usbhid->lock); - switch (urb->status) { + switch (status) { case 0: /* success */ if (usbhid->ctrl[usbhid->ctrltail].dir == USB_DIR_IN) hid_input_report(urb->context, @@ -383,7 +468,7 @@ static void hid_ctrl(struct urb *urb) break; default: /* error */ dev_warn(&urb->dev->dev, "ctrl urb status %d " - "received\n", urb->status); + "received\n", status); } if (unplug) @@ -396,19 +481,18 @@ static void hid_ctrl(struct urb *urb) clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); wake_up(&usbhid->wait); } - spin_unlock_irqrestore(&usbhid->ctrllock, flags); + spin_unlock(&usbhid->lock); return; } clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); - spin_unlock_irqrestore(&usbhid->ctrllock, flags); + spin_unlock(&usbhid->lock); wake_up(&usbhid->wait); } -void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir) +void __usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir) { int head; - unsigned long flags; struct usbhid_device *usbhid = hid->driver_data; int len = ((report->size - 1) >> 3) + 1 + (report->id > 0); @@ -416,18 +500,13 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns return; if (usbhid->urbout && dir == USB_DIR_OUT && report->type == HID_OUTPUT_REPORT) { - - spin_lock_irqsave(&usbhid->outlock, flags); - if ((head = (usbhid->outhead + 1) & (HID_OUTPUT_FIFO_SIZE - 1)) == usbhid->outtail) { - spin_unlock_irqrestore(&usbhid->outlock, flags); dev_warn(&hid->dev, "output queue full\n"); return; } usbhid->out[usbhid->outhead].raw_report = kmalloc(len, GFP_ATOMIC); if (!usbhid->out[usbhid->outhead].raw_report) { - spin_unlock_irqrestore(&usbhid->outlock, flags); dev_warn(&hid->dev, "output queueing failed\n"); return; } @@ -438,15 +517,10 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns if (!test_and_set_bit(HID_OUT_RUNNING, &usbhid->iofl)) if (hid_submit_out(hid)) clear_bit(HID_OUT_RUNNING, &usbhid->iofl); - - spin_unlock_irqrestore(&usbhid->outlock, flags); return; } - spin_lock_irqsave(&usbhid->ctrllock, flags); - if ((head = (usbhid->ctrlhead + 1) & (HID_CONTROL_FIFO_SIZE - 1)) == usbhid->ctrltail) { - spin_unlock_irqrestore(&usbhid->ctrllock, flags); dev_warn(&hid->dev, "control queue full\n"); return; } @@ -454,7 +528,6 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns if (dir == USB_DIR_OUT) { usbhid->ctrl[usbhid->ctrlhead].raw_report = kmalloc(len, GFP_ATOMIC); if (!usbhid->ctrl[usbhid->ctrlhead].raw_report) { - spin_unlock_irqrestore(&usbhid->ctrllock, flags); dev_warn(&hid->dev, "control queueing failed\n"); return; } @@ -467,15 +540,25 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns if (!test_and_set_bit(HID_CTRL_RUNNING, &usbhid->iofl)) if (hid_submit_ctrl(hid)) clear_bit(HID_CTRL_RUNNING, &usbhid->iofl); +} - spin_unlock_irqrestore(&usbhid->ctrllock, flags); +void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir) +{ + struct usbhid_device *usbhid = hid->driver_data; + unsigned long flags; + + spin_lock_irqsave(&usbhid->lock, flags); + __usbhid_submit_report(hid, report, dir); + spin_unlock_irqrestore(&usbhid->lock, flags); } EXPORT_SYMBOL_GPL(usbhid_submit_report); static int usb_hidinput_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { struct hid_device *hid = input_get_drvdata(dev); + struct usbhid_device *usbhid = hid->driver_data; struct hid_field *field; + unsigned long flags; int offset; if (type == EV_FF) @@ -490,6 +573,15 @@ static int usb_hidinput_input_event(struct input_dev *dev, unsigned int type, un } hid_set_field(field, offset, value); + if (value) { + spin_lock_irqsave(&usbhid->lock, flags); + usbhid->ledcount++; + spin_unlock_irqrestore(&usbhid->lock, flags); + } else { + spin_lock_irqsave(&usbhid->lock, flags); + usbhid->ledcount--; + spin_unlock_irqrestore(&usbhid->lock, flags); + } usbhid_submit_report(hid, field->report, USB_DIR_OUT); return 0; @@ -538,15 +630,22 @@ int usbhid_open(struct hid_device *hid) struct usbhid_device *usbhid = hid->driver_data; int res; + mutex_lock(&hid_open_mut); if (!hid->open++) { res = usb_autopm_get_interface(usbhid->intf); + /* the device must be awake to reliable request remote wakeup */ if (res < 0) { hid->open--; + mutex_unlock(&hid_open_mut); return -EIO; } + usbhid->intf->needs_remote_wakeup = 1; + if (hid_start_in(hid)) + hid_io_error(hid); + + usb_autopm_put_interface(usbhid->intf); } - if (hid_start_in(hid)) - hid_io_error(hid); + mutex_unlock(&hid_open_mut); return 0; } @@ -554,10 +653,22 @@ void usbhid_close(struct hid_device *hid) { struct usbhid_device *usbhid = hid->driver_data; + mutex_lock(&hid_open_mut); + + /* protecting hid->open to make sure we don't restart + * data acquistion due to a resumption we no longer + * care about + */ + spin_lock_irq(&usbhid->lock); if (!--hid->open) { + spin_unlock_irq(&usbhid->lock); usb_kill_urb(usbhid->urbin); - usb_autopm_put_interface(usbhid->intf); + flush_scheduled_work(); + usbhid->intf->needs_remote_wakeup = 0; + } else { + spin_unlock_irq(&usbhid->lock); } + mutex_unlock(&hid_open_mut); } /* @@ -687,6 +798,25 @@ static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t co return ret; } +static void usbhid_restart_queues(struct usbhid_device *usbhid) +{ + if (usbhid->urbout) + usbhid_restart_out_queue(usbhid); + usbhid_restart_ctrl_queue(usbhid); +} + +static void __usbhid_restart_queues(struct work_struct *work) +{ + struct usbhid_device *usbhid = + container_of(work, struct usbhid_device, restart_work); + int r; + + r = usb_autopm_get_interface(usbhid->intf); + if (r < 0) + return; + usb_autopm_put_interface(usbhid->intf); +} + static void hid_free_buffers(struct usb_device *dev, struct hid_device *hid) { struct usbhid_device *usbhid = hid->driver_data; @@ -850,11 +980,11 @@ static int usbhid_start(struct hid_device *hid) init_waitqueue_head(&usbhid->wait); INIT_WORK(&usbhid->reset_work, hid_reset); + INIT_WORK(&usbhid->restart_work, __usbhid_restart_queues); setup_timer(&usbhid->io_retry, hid_retry_timeout, (unsigned long) hid); - spin_lock_init(&usbhid->inlock); - spin_lock_init(&usbhid->outlock); - spin_lock_init(&usbhid->ctrllock); + spin_lock_init(&usbhid->lock); + spin_lock_init(&usbhid->lock); usbhid->intf = intf; usbhid->ifnum = interface->desc.bInterfaceNumber; @@ -906,15 +1036,14 @@ static void usbhid_stop(struct hid_device *hid) return; clear_bit(HID_STARTED, &usbhid->iofl); - spin_lock_irq(&usbhid->inlock); /* Sync with error handler */ + spin_lock_irq(&usbhid->lock); /* Sync with error handler */ set_bit(HID_DISCONNECTED, &usbhid->iofl); - spin_unlock_irq(&usbhid->inlock); + spin_unlock_irq(&usbhid->lock); usb_kill_urb(usbhid->urbin); usb_kill_urb(usbhid->urbout); usb_kill_urb(usbhid->urbctrl); - del_timer_sync(&usbhid->io_retry); - cancel_work_sync(&usbhid->reset_work); + hid_cancel_delayed_stuff(usbhid); if (hid->claimed & HID_CLAIMED_INPUT) hidinput_disconnect(hid); @@ -935,12 +1064,28 @@ static void usbhid_stop(struct hid_device *hid) hid_free_buffers(hid_to_usb_dev(hid), hid); } +static int usbhid_power(struct hid_device *hid, int lvl) +{ + int r = 0; + + switch (lvl) { + case PM_HINT_FULLON: + r = usbhid_get_power(hid); + break; + case PM_HINT_NORMAL: + usbhid_put_power(hid); + break; + } + return r; +} + static struct hid_ll_driver usb_hid_driver = { .parse = usbhid_parse, .start = usbhid_start, .stop = usbhid_stop, .open = usbhid_open, .close = usbhid_close, + .power = usbhid_power, .hidinput_input_event = usb_hidinput_input_event, }; @@ -1049,19 +1194,75 @@ static void hid_disconnect(struct usb_interface *intf) kfree(usbhid); } +static void hid_cancel_delayed_stuff(struct usbhid_device *usbhid) +{ + del_timer_sync(&usbhid->io_retry); + cancel_work_sync(&usbhid->restart_work); + cancel_work_sync(&usbhid->reset_work); +} + +static void hid_cease_io(struct usbhid_device *usbhid) +{ + del_timer(&usbhid->io_retry); + usb_kill_urb(usbhid->urbin); + usb_kill_urb(usbhid->urbctrl); + usb_kill_urb(usbhid->urbout); + flush_scheduled_work(); +} + static int hid_suspend(struct usb_interface *intf, pm_message_t message) { - struct hid_device *hid = usb_get_intfdata (intf); + struct hid_device *hid = usb_get_intfdata(intf); struct usbhid_device *usbhid = hid->driver_data; + struct usb_device *udev = interface_to_usbdev(intf); + int status; - if (!test_bit(HID_STARTED, &usbhid->iofl)) - return 0; + if (udev->auto_pm) { + spin_lock_irq(&usbhid->lock); /* Sync with error handler */ + if (!test_bit(HID_RESET_PENDING, &usbhid->iofl) + && !test_bit(HID_CLEAR_HALT, &usbhid->iofl) + && !test_bit(HID_OUT_RUNNING, &usbhid->iofl) + && !test_bit(HID_CTRL_RUNNING, &usbhid->iofl) + && !test_bit(HID_KEYS_PRESSED, &usbhid->iofl) + && (!usbhid->ledcount || ignoreled)) + { + set_bit(HID_REPORTED_IDLE, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + } else { + usbhid_mark_busy(usbhid); + spin_unlock_irq(&usbhid->lock); + return -EBUSY; + } - spin_lock_irq(&usbhid->inlock); /* Sync with error handler */ - set_bit(HID_SUSPENDED, &usbhid->iofl); - spin_unlock_irq(&usbhid->inlock); - del_timer_sync(&usbhid->io_retry); - usb_kill_urb(usbhid->urbin); + } else { + spin_lock_irq(&usbhid->lock); + set_bit(HID_REPORTED_IDLE, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + if (usbhid_wait_io(hid) < 0) + return -EIO; + } + + if (!ignoreled && udev->auto_pm) { + spin_lock_irq(&usbhid->lock); + if (test_bit(HID_LED_ON, &usbhid->iofl)) { + spin_unlock_irq(&usbhid->lock); + usbhid_mark_busy(usbhid); + return -EBUSY; + } + spin_unlock_irq(&usbhid->lock); + } + + hid_cancel_delayed_stuff(usbhid); + hid_cease_io(usbhid); + + if (udev->auto_pm && test_bit(HID_KEYS_PRESSED, &usbhid->iofl)) { + /* lost race against keypresses */ + status = hid_start_in(hid); + if (status < 0) + hid_io_error(hid); + usbhid_mark_busy(usbhid); + return -EBUSY; + } dev_dbg(&intf->dev, "suspend\n"); return 0; } @@ -1075,18 +1276,33 @@ static int hid_resume(struct usb_interface *intf) if (!test_bit(HID_STARTED, &usbhid->iofl)) return 0; - clear_bit(HID_SUSPENDED, &usbhid->iofl); + clear_bit(HID_REPORTED_IDLE, &usbhid->iofl); + usbhid_mark_busy(usbhid); + + if (test_bit(HID_CLEAR_HALT, &usbhid->iofl) || + test_bit(HID_RESET_PENDING, &usbhid->iofl)) + schedule_work(&usbhid->reset_work); usbhid->retry_delay = 0; status = hid_start_in(hid); + if (status < 0) + hid_io_error(hid); + usbhid_restart_queues(usbhid); + dev_dbg(&intf->dev, "resume status %d\n", status); - return status; + return 0; } /* Treat USB reset pretty much the same as suspend/resume */ static int hid_pre_reset(struct usb_interface *intf) { - /* FIXME: What if the interface is already suspended? */ - hid_suspend(intf, PMSG_ON); + struct hid_device *hid = usb_get_intfdata(intf); + struct usbhid_device *usbhid = hid->driver_data; + + spin_lock_irq(&usbhid->lock); + set_bit(HID_RESET_PENDING, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + hid_cease_io(usbhid); + return 0; } @@ -1094,11 +1310,35 @@ static int hid_pre_reset(struct usb_interface *intf) static int hid_post_reset(struct usb_interface *intf) { struct usb_device *dev = interface_to_usbdev (intf); - + struct hid_device *hid = usb_get_intfdata(intf); + struct usbhid_device *usbhid = hid->driver_data; + int status; + + spin_lock_irq(&usbhid->lock); + clear_bit(HID_RESET_PENDING, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); hid_set_idle(dev, intf->cur_altsetting->desc.bInterfaceNumber, 0, 0); /* FIXME: Any more reinitialization needed? */ + status = hid_start_in(hid); + if (status < 0) + hid_io_error(hid); + usbhid_restart_queues(usbhid); - return hid_resume(intf); + return 0; +} + +int usbhid_get_power(struct hid_device *hid) +{ + struct usbhid_device *usbhid = hid->driver_data; + + return usb_autopm_get_interface(usbhid->intf); +} + +void usbhid_put_power(struct hid_device *hid) +{ + struct usbhid_device *usbhid = hid->driver_data; + + usb_autopm_put_interface(usbhid->intf); } static struct usb_device_id hid_usb_ids [] = { @@ -1134,7 +1374,11 @@ static struct hid_driver hid_usb_driver = { static int __init hid_init(void) { - int retval; + int retval = -ENOMEM; + + resumption_waker = create_freezeable_workqueue("usbhid_resumer"); + if (!resumption_waker) + goto no_queue; retval = hid_register_driver(&hid_usb_driver); if (retval) goto hid_register_fail; @@ -1158,6 +1402,8 @@ hiddev_init_fail: usbhid_quirks_init_fail: hid_unregister_driver(&hid_usb_driver); hid_register_fail: + destroy_workqueue(resumption_waker); +no_queue: return retval; } @@ -1167,6 +1413,7 @@ static void __exit hid_exit(void) hiddev_exit(); usbhid_quirks_exit(); hid_unregister_driver(&hid_usb_driver); + destroy_workqueue(resumption_waker); } module_init(hid_init); diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 1f5b5d4c3c34..fd7375627e5d 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -249,10 +249,12 @@ static int hiddev_release(struct inode * inode, struct file * file) spin_unlock_irqrestore(&list->hiddev->list_lock, flags); if (!--list->hiddev->open) { - if (list->hiddev->exist) + if (list->hiddev->exist) { usbhid_close(list->hiddev->hid); - else + usbhid_put_power(list->hiddev->hid); + } else { kfree(list->hiddev); + } } kfree(list); @@ -303,6 +305,17 @@ static int hiddev_open(struct inode *inode, struct file *file) list_add_tail(&list->node, &hiddev_table[i]->list); spin_unlock_irq(&list->hiddev->list_lock); + if (!list->hiddev->open++) + if (list->hiddev->exist) { + struct hid_device *hid = hiddev_table[i]->hid; + res = usbhid_get_power(hid); + if (res < 0) { + res = -EIO; + goto bail; + } + usbhid_open(hid); + } + return 0; bail: file->private_data = NULL; diff --git a/drivers/hid/usbhid/usbhid.h b/drivers/hid/usbhid/usbhid.h index 9eb30564be9c..08f505ca2e3d 100644 --- a/drivers/hid/usbhid/usbhid.h +++ b/drivers/hid/usbhid/usbhid.h @@ -38,7 +38,10 @@ int usbhid_wait_io(struct hid_device* hid); void usbhid_close(struct hid_device *hid); int usbhid_open(struct hid_device *hid); void usbhid_init_reports(struct hid_device *hid); -void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, unsigned char dir); +void usbhid_submit_report +(struct hid_device *hid, struct hid_report *report, unsigned char dir); +int usbhid_get_power(struct hid_device *hid); +void usbhid_put_power(struct hid_device *hid); /* iofl flags */ #define HID_CTRL_RUNNING 1 @@ -49,6 +52,9 @@ void usbhid_submit_report(struct hid_device *hid, struct hid_report *report, uns #define HID_CLEAR_HALT 6 #define HID_DISCONNECTED 7 #define HID_STARTED 8 +#define HID_REPORTED_IDLE 9 +#define HID_KEYS_PRESSED 10 +#define HID_LED_ON 11 /* * USB-specific HID struct, to be pointed to @@ -66,7 +72,6 @@ struct usbhid_device { struct urb *urbin; /* Input URB */ char *inbuf; /* Input buffer */ dma_addr_t inbuf_dma; /* Input buffer dma */ - spinlock_t inlock; /* Input fifo spinlock */ struct urb *urbctrl; /* Control URB */ struct usb_ctrlrequest *cr; /* Control request struct */ @@ -75,21 +80,22 @@ struct usbhid_device { unsigned char ctrlhead, ctrltail; /* Control fifo head & tail */ char *ctrlbuf; /* Control buffer */ dma_addr_t ctrlbuf_dma; /* Control buffer dma */ - spinlock_t ctrllock; /* Control fifo spinlock */ struct urb *urbout; /* Output URB */ struct hid_output_fifo out[HID_CONTROL_FIFO_SIZE]; /* Output pipe fifo */ unsigned char outhead, outtail; /* Output pipe fifo head & tail */ char *outbuf; /* Output buffer */ dma_addr_t outbuf_dma; /* Output buffer dma */ - spinlock_t outlock; /* Output fifo spinlock */ + spinlock_t lock; /* fifo spinlock */ unsigned long iofl; /* I/O flags (CTRL_RUNNING, OUT_RUNNING) */ struct timer_list io_retry; /* Retry timer */ unsigned long stop_retry; /* Time to give up, in jiffies */ unsigned int retry_delay; /* Delay length in ms */ struct work_struct reset_work; /* Task context for resets */ + struct work_struct restart_work; /* waking up for output to be done in a task */ wait_queue_head_t wait; /* For sleeping */ + int ledcount; /* counting the number of active leds */ }; #define hid_to_usb_dev(hid_dev) \ diff --git a/include/linux/hid.h b/include/linux/hid.h index fa8ee9cef7be..6ac7795a8acc 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -603,12 +603,17 @@ struct hid_ll_driver { int (*open)(struct hid_device *hdev); void (*close)(struct hid_device *hdev); + int (*power)(struct hid_device *hdev, int level); + int (*hidinput_input_event) (struct input_dev *idev, unsigned int type, unsigned int code, int value); int (*parse)(struct hid_device *hdev); }; +#define PM_HINT_FULLON 1<<5 +#define PM_HINT_NORMAL 1<<1 + /* Applications from HID Usage Tables 4/8/99 Version 1.1 */ /* We ignore a few input applications that are not widely used */ #define IS_INPUT_APPLICATION(a) (((a >= 0x00010000) && (a <= 0x00010008)) || (a == 0x00010080) || (a == 0x000c0001) || (a == 0x000d0002)) @@ -641,6 +646,7 @@ int hidinput_find_field(struct hid_device *hid, unsigned int type, unsigned int void hid_output_report(struct hid_report *report, __u8 *data); struct hid_device *hid_allocate_device(void); int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); +int hid_check_keys_pressed(struct hid_device *hid); int hid_connect(struct hid_device *hid, unsigned int connect_mask); /** -- cgit v1.2.3-59-g8ed1b From 0f6f1407e3e4a2cc4f602806d3c174473850f4da Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 19 Jan 2009 09:17:18 +0100 Subject: HID: do not try to compile PM code with CONFIG_PM unset Fixes this build breakage in case when CONFIG_PM is not set drivers/hid/usbhid/hid-core.c: In function 'hid_suspend': drivers/hid/usbhid/hid-core.c:1220: error: 'struct usb_device' has no member named 'auto_pm' drivers/hid/usbhid/hid-core.c:1245: error: 'struct usb_device' has no member named 'auto_pm' drivers/hid/usbhid/hid-core.c:1258: error: 'struct usb_device' has no member named 'auto_pm' by throwing both the hid_suspend() and hid_resume() away completely in such case, as they won't be used anyway. Reported-by: Stephen Rothwell Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 625e7e8eb373..d8799a18a422 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -1210,6 +1210,7 @@ static void hid_cease_io(struct usbhid_device *usbhid) flush_scheduled_work(); } +#ifdef CONFIG_PM static int hid_suspend(struct usb_interface *intf, pm_message_t message) { struct hid_device *hid = usb_get_intfdata(intf); @@ -1292,6 +1293,8 @@ static int hid_resume(struct usb_interface *intf) return 0; } +#endif /* CONFIG_PM */ + /* Treat USB reset pretty much the same as suspend/resume */ static int hid_pre_reset(struct usb_interface *intf) { @@ -1353,9 +1356,11 @@ static struct usb_driver hid_driver = { .name = "usbhid", .probe = hid_probe, .disconnect = hid_disconnect, +#ifdef CONFIG_PM .suspend = hid_suspend, .resume = hid_resume, .reset_resume = hid_post_reset, +#endif .pre_reset = hid_pre_reset, .post_reset = hid_post_reset, .id_table = hid_usb_ids, -- cgit v1.2.3-59-g8ed1b From 378a0ede126a2c59ab9ea1771de624521db5053e Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 18 Feb 2009 11:46:45 +0100 Subject: HID: fix USB HID devices after STD with autosuspend This patch fixes a bug caused by reset_resume not changing the internal status flags for a device that is resumed via reset_resume. To do so the reset handlers, which correctly assume that a device is awake, can no longer do all the work of reset_resume handling. Signed-off-by: Oliver Neukum Tested-by: Laurent Riffard Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index d8799a18a422..d79100a01b05 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -1330,6 +1330,15 @@ static int hid_post_reset(struct usb_interface *intf) return 0; } +static int hid_reset_resume(struct usb_interface *intf) +{ + struct hid_device *hid = usb_get_intfdata(intf); + struct usbhid_device *usbhid = hid->driver_data; + + clear_bit(HID_REPORTED_IDLE, &usbhid->iofl); + return hid_post_reset(intf); +} + int usbhid_get_power(struct hid_device *hid) { struct usbhid_device *usbhid = hid->driver_data; @@ -1359,7 +1368,7 @@ static struct usb_driver hid_driver = { #ifdef CONFIG_PM .suspend = hid_suspend, .resume = hid_resume, - .reset_resume = hid_post_reset, + .reset_resume = hid_reset_resume, #endif .pre_reset = hid_pre_reset, .post_reset = hid_post_reset, -- cgit v1.2.3-59-g8ed1b From ae2f007468223e9efd6973be0364b9307a050a0c Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 20 Feb 2009 12:47:08 +0100 Subject: HID: hid_reset_resume() needs to be defined only when CONFIG_PM is set There is no point of having hid_reset_resume() when CONFIG_PM is not set, and even the corresponding .reset_resume pointer in hid_driver struct is properly ifdefed. Move the definition into the ifdef CONFIG_PM part of the source to avoid drivers/hid/usbhid/hid-core.c:1337: warning: 'hid_reset_resume' defined but not used Reported-by: Stephen Rothwell Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 101 +++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 50 deletions(-) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index d79100a01b05..054f0c521e6f 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -1210,6 +1210,56 @@ static void hid_cease_io(struct usbhid_device *usbhid) flush_scheduled_work(); } +/* Treat USB reset pretty much the same as suspend/resume */ +static int hid_pre_reset(struct usb_interface *intf) +{ + struct hid_device *hid = usb_get_intfdata(intf); + struct usbhid_device *usbhid = hid->driver_data; + + spin_lock_irq(&usbhid->lock); + set_bit(HID_RESET_PENDING, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + hid_cease_io(usbhid); + + return 0; +} + +/* Same routine used for post_reset and reset_resume */ +static int hid_post_reset(struct usb_interface *intf) +{ + struct usb_device *dev = interface_to_usbdev (intf); + struct hid_device *hid = usb_get_intfdata(intf); + struct usbhid_device *usbhid = hid->driver_data; + int status; + + spin_lock_irq(&usbhid->lock); + clear_bit(HID_RESET_PENDING, &usbhid->iofl); + spin_unlock_irq(&usbhid->lock); + hid_set_idle(dev, intf->cur_altsetting->desc.bInterfaceNumber, 0, 0); + /* FIXME: Any more reinitialization needed? */ + status = hid_start_in(hid); + if (status < 0) + hid_io_error(hid); + usbhid_restart_queues(usbhid); + + return 0; +} + +int usbhid_get_power(struct hid_device *hid) +{ + struct usbhid_device *usbhid = hid->driver_data; + + return usb_autopm_get_interface(usbhid->intf); +} + +void usbhid_put_power(struct hid_device *hid) +{ + struct usbhid_device *usbhid = hid->driver_data; + + usb_autopm_put_interface(usbhid->intf); +} + + #ifdef CONFIG_PM static int hid_suspend(struct usb_interface *intf, pm_message_t message) { @@ -1293,43 +1343,6 @@ static int hid_resume(struct usb_interface *intf) return 0; } -#endif /* CONFIG_PM */ - -/* Treat USB reset pretty much the same as suspend/resume */ -static int hid_pre_reset(struct usb_interface *intf) -{ - struct hid_device *hid = usb_get_intfdata(intf); - struct usbhid_device *usbhid = hid->driver_data; - - spin_lock_irq(&usbhid->lock); - set_bit(HID_RESET_PENDING, &usbhid->iofl); - spin_unlock_irq(&usbhid->lock); - hid_cease_io(usbhid); - - return 0; -} - -/* Same routine used for post_reset and reset_resume */ -static int hid_post_reset(struct usb_interface *intf) -{ - struct usb_device *dev = interface_to_usbdev (intf); - struct hid_device *hid = usb_get_intfdata(intf); - struct usbhid_device *usbhid = hid->driver_data; - int status; - - spin_lock_irq(&usbhid->lock); - clear_bit(HID_RESET_PENDING, &usbhid->iofl); - spin_unlock_irq(&usbhid->lock); - hid_set_idle(dev, intf->cur_altsetting->desc.bInterfaceNumber, 0, 0); - /* FIXME: Any more reinitialization needed? */ - status = hid_start_in(hid); - if (status < 0) - hid_io_error(hid); - usbhid_restart_queues(usbhid); - - return 0; -} - static int hid_reset_resume(struct usb_interface *intf) { struct hid_device *hid = usb_get_intfdata(intf); @@ -1339,19 +1352,7 @@ static int hid_reset_resume(struct usb_interface *intf) return hid_post_reset(intf); } -int usbhid_get_power(struct hid_device *hid) -{ - struct usbhid_device *usbhid = hid->driver_data; - - return usb_autopm_get_interface(usbhid->intf); -} - -void usbhid_put_power(struct hid_device *hid) -{ - struct usbhid_device *usbhid = hid->driver_data; - - usb_autopm_put_interface(usbhid->intf); -} +#endif /* CONFIG_PM */ static struct usb_device_id hid_usb_ids [] = { { .match_flags = USB_DEVICE_ID_MATCH_INT_CLASS, -- cgit v1.2.3-59-g8ed1b From 6d77976800c3f29a0337fadcc9ddd79050fa5620 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Sun, 22 Mar 2009 18:01:49 +0100 Subject: HID: autosuspend -- fix lockup of hid on reset This fixes a use of flush_scheduled_work() in USB HID's reset logic that can deadlock. Tested-by: Valdis Kletniks Signed-off-by: Oliver Neukum Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 054f0c521e6f..a1ef41f067ce 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -1207,7 +1207,6 @@ static void hid_cease_io(struct usbhid_device *usbhid) usb_kill_urb(usbhid->urbin); usb_kill_urb(usbhid->urbctrl); usb_kill_urb(usbhid->urbout); - flush_scheduled_work(); } /* Treat USB reset pretty much the same as suspend/resume */ @@ -1219,6 +1218,7 @@ static int hid_pre_reset(struct usb_interface *intf) spin_lock_irq(&usbhid->lock); set_bit(HID_RESET_PENDING, &usbhid->iofl); spin_unlock_irq(&usbhid->lock); + cancel_work_sync(&usbhid->restart_work); hid_cease_io(usbhid); return 0; -- cgit v1.2.3-59-g8ed1b From e6f489013b985b58d096a3091ece0ed579367232 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 16:27:17 +0800 Subject: trace_stat: don't call seq_printf() in seq_operation->start() Impact: Fix incorrect way using seq_file's API Use SEQ_START_TOKEN instead of calling ->stat_headers() int seq_operation->start(). Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Cc: Alexey Dobriyan LKML-Reference: <49C9EAE5.5070202@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index f71b85b22cfe..8c129dd480ad 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -163,7 +163,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) /* If we are in the beginning of the file, print the headers */ if (!*pos && session->ts->stat_headers) - session->ts->stat_headers(s); + return SEQ_START_TOKEN; return seq_list_start(&session->stat_list, *pos); } @@ -172,6 +172,9 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { struct tracer_stat_session *session = s->private; + if (p == SEQ_START_TOKEN) + return seq_list_start(&session->stat_list, *pos); + return seq_list_next(p, &session->stat_list, pos); } @@ -186,6 +189,9 @@ static int stat_seq_show(struct seq_file *s, void *v) struct tracer_stat_session *session = s->private; struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + if (v == SEQ_START_TOKEN) + return session->ts->stat_headers(s); + return session->ts->stat_show(s, l->stat); } -- cgit v1.2.3-59-g8ed1b From 220ba351dfa57eca4bec5ce0098a276446a47958 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 16:58:39 +0800 Subject: trace_stat: keep original order Impact: make trace_stat files show items with the original order trace_stat tracer reverse the items, it makes the output looks a little ugly. Example, when we read trace_stat/workqueues, we get cpu#7's stat. at first, and then cpu#6... cpu#0. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <49C9F23F.5040307@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stat.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 8c129dd480ad..acdebd771a93 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -125,23 +125,21 @@ static int stat_seq_init(struct tracer_stat_session *session) INIT_LIST_HEAD(&new_entry->list); new_entry->stat = stat; - list_for_each_entry(iter_entry, &session->stat_list, list) { + list_for_each_entry_reverse(iter_entry, &session->stat_list, + list) { /* Insertion with a descendent sorting */ - if (ts->stat_cmp(new_entry->stat, - iter_entry->stat) > 0) { + if (ts->stat_cmp(iter_entry->stat, + new_entry->stat) >= 0) { - list_add_tail(&new_entry->list, - &iter_entry->list); - break; - - /* The current smaller value */ - } else if (list_is_last(&iter_entry->list, - &session->stat_list)) { list_add(&new_entry->list, &iter_entry->list); break; } } + + /* The current larger value */ + if (list_empty(&new_entry->list)) + list_add(&new_entry->list, &session->stat_list); } exit: mutex_unlock(&session->stat_mutex); -- cgit v1.2.3-59-g8ed1b From 2f63b840bc8a816ac879ee773b035cf3e433fae4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 16:59:18 +0800 Subject: trace_workqueues: fix empty line's output Empty lines separate cpus stat. After previous fix(trace_stat: keep original order) applied, the empty lines are displayed at incorrect position. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <49C9F266.2060706@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_workqueue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 9ab035b58cf1..797201e4a137 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -196,6 +196,11 @@ static int workqueue_stat_show(struct seq_file *s, void *p) struct pid *pid; struct task_struct *tsk; + spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); + if (&cws->list == workqueue_cpu_stat(cpu)->list.next) + seq_printf(s, "\n"); + spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); + pid = find_get_pid(cws->pid); if (pid) { tsk = get_pid_task(pid, PIDTYPE_PID); @@ -208,18 +213,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p) put_pid(pid); } - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (&cws->list == workqueue_cpu_stat(cpu)->list.next) - seq_printf(s, "\n"); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - return 0; } static int workqueue_stat_headers(struct seq_file *s) { seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); - seq_printf(s, "# | | | |\n\n"); + seq_printf(s, "# | | | |\n"); return 0; } -- cgit v1.2.3-59-g8ed1b From 759ee0915dd713361e72facb78b66600b5712d65 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 25 Mar 2009 17:06:30 +0800 Subject: init,cpuset: fix initialize order Impact: cpuset_wq should be initialized after init_workqueues() When I read /debugfs/tracing/trace_stat/workqueues, I got this: # CPU INSERTED EXECUTED NAME # | | | | 0 0 0 cpuset 0 285 285 events/0 0 2 2 work_on_cpu/0 0 1115 1115 khelper 0 325 325 kblockd/0 0 0 0 kacpid 0 0 0 kacpi_notify 0 0 0 ata/0 0 0 0 ata_aux 0 0 0 ksuspend_usbd 0 0 0 aio/0 0 0 0 nfsiod 0 0 0 kpsmoused 0 0 0 kstriped 0 0 0 kondemand/0 0 1 1 hid_compat 0 0 0 rpciod/0 1 64 64 events/1 1 2 2 work_on_cpu/1 1 5 5 kblockd/1 1 0 0 ata/1 1 0 0 aio/1 1 0 0 kondemand/1 1 0 0 rpciod/1 I found "cpuset" is at the earliest. I found a create_singlethread_workqueue() is earlier than init_workqueues(): kernel_init() ->cpuset_init_smp() ->create_singlethread_workqueue() ->do_basic_setup() ->init_workqueues() I think it's better that create_singlethread_workqueue() is called after workqueue subsystem has been initialized. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Paul Menage Cc: miaoxie Cc: Li Zefan Cc: Andrew Morton LKML-Reference: <49C9F416.1050707@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- init/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index 20d784ab5ef8..b0097d2b63ae 100644 --- a/init/main.c +++ b/init/main.c @@ -772,6 +772,7 @@ static void __init do_basic_setup(void) { rcu_init_sched(); /* needed by module_init stage. */ init_workqueues(); + cpuset_init_smp(); usermodehelper_init(); driver_init(); init_irq_proc(); @@ -865,8 +866,6 @@ static int __init kernel_init(void * unused) smp_init(); sched_init_smp(); - cpuset_init_smp(); - do_basic_setup(); /* -- cgit v1.2.3-59-g8ed1b From fee039a1d05c6e0f71b0fe270d847742a02d56c4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 23 Mar 2009 10:14:52 -0400 Subject: x86: kretprobe-booster interrupt emulation code fix Fix interrupt emulation code in kretprobe-booster according to pt_regs update (es/ds change and gs adding). This issue has been reported on systemtap-bugzilla: http://sources.redhat.com/bugzilla/show_bug.cgi?id=9965 | On a -tip kernel on x86_32, kretprobe_example (from samples) triggers the | following backtrace when its retprobing a class of functions that cause a | copy_from/to_user(). | | BUG: sleeping function called from invalid context at mm/memory.c:3196 | in_atomic(): 0, irqs_disabled(): 1, pid: 2286, name: cat Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Tested-by: Bharata B Rao Cc: systemtap-ml LKML-Reference: <49C7995C.2010601@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 4558dd3918cf..759095d53a06 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -638,13 +638,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void) #else " pushf\n" /* - * Skip cs, ip, orig_ax. + * Skip cs, ip, orig_ax and gs. * trampoline_handler() will plug in these values */ - " subl $12, %esp\n" + " subl $16, %esp\n" " pushl %fs\n" - " pushl %ds\n" " pushl %es\n" + " pushl %ds\n" " pushl %eax\n" " pushl %ebp\n" " pushl %edi\n" @@ -655,10 +655,10 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ - " movl 52(%esp), %edx\n" - " movl %edx, 48(%esp)\n" + " movl 56(%esp), %edx\n" + " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ - " movl %eax, 52(%esp)\n" + " movl %eax, 56(%esp)\n" " popl %ebx\n" " popl %ecx\n" " popl %edx\n" @@ -666,8 +666,8 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " popl %edi\n" " popl %ebp\n" " popl %eax\n" - /* Skip ip, orig_ax, es, ds, fs */ - " addl $20, %esp\n" + /* Skip ds, es, fs, gs, orig_ax and ip */ + " addl $24, %esp\n" " popf\n" #endif " ret\n"); @@ -691,6 +691,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) regs->cs = __KERNEL_CS; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; -- cgit v1.2.3-59-g8ed1b From 2a4efa42450762cbfa5c5712aa4cc9f06924c9fd Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Wed, 25 Mar 2009 12:06:05 +0800 Subject: ftrace: Using FTRACE_WARN_ON() to check "freed record" in ftrace_release() "Because when we call ftrace_free_rec we change the rec->ip to point to the next record in the chain. Something is very wrong if rec->ip >= s && rec->ip < e and the record is already free." "Note, use FTRACE_WARN_ON() macro. This way it shuts down ftrace if it is hit and helps to avoid further damage later." -- Steven Rostedt Signed-off-by: Zhao Lei Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b8722baf153..1752a63f37c0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -358,9 +358,14 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. + */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); ftrace_free_rec(rec); + } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } -- cgit v1.2.3-59-g8ed1b From 9a8118baaeb0eaa148913bed77bf9c6335f6ca63 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 26 Mar 2009 01:24:34 -0500 Subject: tracing: filter fix for TRACE_EVENT_FORMAT events Impact: fix crash (hang) when using TRACE_EVENT_FORMAT filter files filters are only hooked up to the tracepoint events defined using TRACE_EVENT but not the tracers that use TRACE_EVENT_FORMAT, such as ftrace. Do not display the filter files at all for TRACE_EVENT_FORMAT events for the time being. Cc: Steven Rostedt Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker LKML-Reference: <1237878882.8339.61.camel@charm-linux> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d132997ab756..64ec4d278ffb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -680,7 +680,6 @@ static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; - struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -709,12 +708,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->preds = NULL; - entry = debugfs_create_file("filter", 0644, system->entry, system, - &ftrace_subsystem_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", name); - return system->entry; } @@ -770,14 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) " events/%s\n", call->name); return ret; } + entry = debugfs_create_file("filter", 0644, call->dir, call, + &ftrace_event_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", call->name); } - entry = debugfs_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", call->name); - /* A trace may not want to export its format */ if (!call->show_format) return 0; -- cgit v1.2.3-59-g8ed1b From ef419021907b2daa51ac31bdc1f858b1b1338b07 Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Tue, 24 Mar 2009 15:45:14 +0100 Subject: avr32: fix 15-bit LCDC pin mask to use MSB lines This patch corrects the 15-bit LCDC pin mask definitions to select the five upper lines in each color byte from the LCDC data output. When reducing the color depth the LCDC will start filling MSB and downwards. Also only enable 5 bits per color as the define indicates. Signed-off-by: Hans-Christian Egtvedt Signed-off-by: Haavard Skinnemoen --- arch/avr32/mach-at32ap/include/mach/at32ap700x.h | 34 +++++++++++++----------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/arch/avr32/mach-at32ap/include/mach/at32ap700x.h b/arch/avr32/mach-at32ap/include/mach/at32ap700x.h index 5c4c971eed8e..f07fcd7390be 100644 --- a/arch/avr32/mach-at32ap/include/mach/at32ap700x.h +++ b/arch/avr32/mach-at32ap/include/mach/at32ap700x.h @@ -172,24 +172,26 @@ ATMEL_LCDC(PD, DATA22) | ATMEL_LCDC(PD, DATA23)) #define ATMEL_LCDC_PRI_15B_DATA ( \ - ATMEL_LCDC(PC, DATA0) | ATMEL_LCDC(PC, DATA1) | \ - ATMEL_LCDC(PC, DATA2) | ATMEL_LCDC(PC, DATA3) | \ - ATMEL_LCDC(PC, DATA4) | ATMEL_LCDC(PC, DATA5) | \ - ATMEL_LCDC(PD, DATA8) | ATMEL_LCDC(PD, DATA9) | \ - ATMEL_LCDC(PD, DATA10) | ATMEL_LCDC(PD, DATA11) | \ - ATMEL_LCDC(PD, DATA12) | ATMEL_LCDC(PD, DATA16) | \ - ATMEL_LCDC(PD, DATA17) | ATMEL_LCDC(PD, DATA18) | \ - ATMEL_LCDC(PD, DATA19) | ATMEL_LCDC(PD, DATA20)) + ATMEL_LCDC(PC, DATA3) | ATMEL_LCDC(PC, DATA4) | \ + ATMEL_LCDC(PC, DATA5) | ATMEL_LCDC(PD, DATA6) | \ + ATMEL_LCDC(PD, DATA7) | \ + ATMEL_LCDC(PD, DATA11) | ATMEL_LCDC(PD, DATA12) | \ + ATMEL_LCDC(PD, DATA13) | ATMEL_LCDC(PD, DATA14) | \ + ATMEL_LCDC(PD, DATA15) | \ + ATMEL_LCDC(PD, DATA19) | ATMEL_LCDC(PD, DATA20) | \ + ATMEL_LCDC(PD, DATA21) | ATMEL_LCDC(PD, DATA22) | \ + ATMEL_LCDC(PD, DATA23)) #define ATMEL_LCDC_ALT_15B_DATA ( \ - ATMEL_LCDC(PE, DATA0) | ATMEL_LCDC(PE, DATA1) | \ - ATMEL_LCDC(PE, DATA2) | ATMEL_LCDC(PE, DATA3) | \ - ATMEL_LCDC(PE, DATA4) | ATMEL_LCDC(PC, DATA5) | \ - ATMEL_LCDC(PE, DATA8) | ATMEL_LCDC(PE, DATA9) | \ - ATMEL_LCDC(PE, DATA10) | ATMEL_LCDC(PE, DATA11) | \ - ATMEL_LCDC(PE, DATA12) | ATMEL_LCDC(PE, DATA16) | \ - ATMEL_LCDC(PE, DATA17) | ATMEL_LCDC(PE, DATA18) | \ - ATMEL_LCDC(PE, DATA19) | ATMEL_LCDC(PE, DATA20)) + ATMEL_LCDC(PE, DATA3) | ATMEL_LCDC(PE, DATA4) | \ + ATMEL_LCDC(PC, DATA5) | ATMEL_LCDC(PD, DATA6) | \ + ATMEL_LCDC(PD, DATA7) | \ + ATMEL_LCDC(PE, DATA11) | ATMEL_LCDC(PE, DATA12) | \ + ATMEL_LCDC(PD, DATA13) | ATMEL_LCDC(PD, DATA14) | \ + ATMEL_LCDC(PD, DATA15) | \ + ATMEL_LCDC(PE, DATA19) | ATMEL_LCDC(PE, DATA20) | \ + ATMEL_LCDC(PE, DATA21) | ATMEL_LCDC(PD, DATA22) | \ + ATMEL_LCDC(PD, DATA23)) #define ATMEL_LCDC_PRI_CONTROL ( \ ATMEL_LCDC(PC, CC) | ATMEL_LCDC(PC, DVAL) | \ -- cgit v1.2.3-59-g8ed1b From 2ae6d5d8f067753e29248863ad7a242eff6fdb0b Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Tue, 24 Mar 2009 15:45:15 +0100 Subject: avr32: add pin mask for 18-bit color on the LCD controller This patch adds two defines for setting a pin mask for 18-bit LCD panels connected to the LCD controller. One mask for primary output and one mask for alternative output. Signed-off-by: Hans-Christian Egtvedt Signed-off-by: Haavard Skinnemoen --- arch/avr32/mach-at32ap/include/mach/at32ap700x.h | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/avr32/mach-at32ap/include/mach/at32ap700x.h b/arch/avr32/mach-at32ap/include/mach/at32ap700x.h index f07fcd7390be..b9222bf895bc 100644 --- a/arch/avr32/mach-at32ap/include/mach/at32ap700x.h +++ b/arch/avr32/mach-at32ap/include/mach/at32ap700x.h @@ -171,6 +171,28 @@ ATMEL_LCDC(PE, DATA20) | ATMEL_LCDC(PE, DATA21) | \ ATMEL_LCDC(PD, DATA22) | ATMEL_LCDC(PD, DATA23)) +#define ATMEL_LCDC_PRI_18B_DATA ( \ + ATMEL_LCDC(PC, DATA2) | ATMEL_LCDC(PC, DATA3) | \ + ATMEL_LCDC(PC, DATA4) | ATMEL_LCDC(PC, DATA5) | \ + ATMEL_LCDC(PD, DATA6) | ATMEL_LCDC(PD, DATA7) | \ + ATMEL_LCDC(PD, DATA10) | ATMEL_LCDC(PD, DATA11) | \ + ATMEL_LCDC(PD, DATA12) | ATMEL_LCDC(PD, DATA13) | \ + ATMEL_LCDC(PD, DATA14) | ATMEL_LCDC(PD, DATA15) | \ + ATMEL_LCDC(PD, DATA18) | ATMEL_LCDC(PD, DATA19) | \ + ATMEL_LCDC(PD, DATA20) | ATMEL_LCDC(PD, DATA21) | \ + ATMEL_LCDC(PD, DATA22) | ATMEL_LCDC(PD, DATA23)) + +#define ATMEL_LCDC_ALT_18B_DATA ( \ + ATMEL_LCDC(PE, DATA2) | ATMEL_LCDC(PE, DATA3) | \ + ATMEL_LCDC(PE, DATA4) | ATMEL_LCDC(PC, DATA5) | \ + ATMEL_LCDC(PD, DATA6) | ATMEL_LCDC(PD, DATA7) | \ + ATMEL_LCDC(PE, DATA10) | ATMEL_LCDC(PE, DATA11) | \ + ATMEL_LCDC(PE, DATA12) | ATMEL_LCDC(PD, DATA13) | \ + ATMEL_LCDC(PD, DATA14) | ATMEL_LCDC(PD, DATA15) | \ + ATMEL_LCDC(PE, DATA18) | ATMEL_LCDC(PE, DATA19) | \ + ATMEL_LCDC(PE, DATA20) | ATMEL_LCDC(PE, DATA21) | \ + ATMEL_LCDC(PD, DATA22) | ATMEL_LCDC(PD, DATA23)) + #define ATMEL_LCDC_PRI_15B_DATA ( \ ATMEL_LCDC(PC, DATA3) | ATMEL_LCDC(PC, DATA4) | \ ATMEL_LCDC(PC, DATA5) | ATMEL_LCDC(PD, DATA6) | \ @@ -209,6 +231,10 @@ #define ATMEL_LCDC_ALT_24BIT (ATMEL_LCDC_CONTROL | ATMEL_LCDC_ALT_24B_DATA) +#define ATMEL_LCDC_PRI_18BIT (ATMEL_LCDC_CONTROL | ATMEL_LCDC_PRI_18B_DATA) + +#define ATMEL_LCDC_ALT_18BIT (ATMEL_LCDC_CONTROL | ATMEL_LCDC_ALT_18B_DATA) + #define ATMEL_LCDC_PRI_15BIT (ATMEL_LCDC_CONTROL | ATMEL_LCDC_PRI_15B_DATA) #define ATMEL_LCDC_ALT_15BIT (ATMEL_LCDC_CONTROL | ATMEL_LCDC_ALT_15B_DATA) -- cgit v1.2.3-59-g8ed1b From ecb1bd894e7238acfc6174602c29190d239be9b1 Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Tue, 24 Mar 2009 15:45:16 +0100 Subject: avr32: set pin mask to alternative 18 bpp for EVKLCD10x boards This patch will set the pin mask to alternative 18 bits per pixel output for EVKLCD10x boards. Signed-off-by: Hans-Christian Egtvedt Signed-off-by: Haavard Skinnemoen --- arch/avr32/boards/atngw100/evklcd10x.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/avr32/boards/atngw100/evklcd10x.c b/arch/avr32/boards/atngw100/evklcd10x.c index 8140b22b3461..fbc7b03ef886 100644 --- a/arch/avr32/boards/atngw100/evklcd10x.c +++ b/arch/avr32/boards/atngw100/evklcd10x.c @@ -149,7 +149,8 @@ static int __init atevklcd10x_init(void) at32_add_device_ac97c(0, &ac97c0_data); at32_add_device_lcdc(0, &atevklcd10x_lcdc_data, - fbmem_start, fbmem_size, 1); + fbmem_start, fbmem_size, + ATMEL_LCDC_ALT_18BIT | ATMEL_LCDC_PE_DVAL); return 0; } postcore_initcall(atevklcd10x_init); -- cgit v1.2.3-59-g8ed1b From fe272b5bd13d3522f9d1ed35425f1c7af4d8343f Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Tue, 24 Mar 2009 15:45:17 +0100 Subject: avr32: configure MCI detect and write protect pins for EVKLCD10x boards This patch removes the special handling of MCI platform data for EVKLCD10x boards. This is now possible since the pin mask for the LCD controller is no longer reserving the I/O lines used for MCI card detection and write protect. Signed-off-by: Hans-Christian Egtvedt Signed-off-by: Haavard Skinnemoen --- arch/avr32/boards/atngw100/setup.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/avr32/boards/atngw100/setup.c b/arch/avr32/boards/atngw100/setup.c index 05d3722fff18..feac0035800c 100644 --- a/arch/avr32/boards/atngw100/setup.c +++ b/arch/avr32/boards/atngw100/setup.c @@ -56,13 +56,8 @@ static struct spi_board_info spi0_board_info[] __initdata = { static struct mci_platform_data __initdata mci0_data = { .slot[0] = { .bus_width = 4, -#ifndef CONFIG_BOARD_ATNGW100_EVKLCD10X .detect_pin = GPIO_PIN_PC(25), .wp_pin = GPIO_PIN_PE(0), -#else - .detect_pin = GPIO_PIN_NONE, - .wp_pin = GPIO_PIN_NONE, -#endif }, }; -- cgit v1.2.3-59-g8ed1b From 30754acf29a9329185ffd8dffbb0c83b5a539818 Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Tue, 24 Mar 2009 15:45:18 +0100 Subject: avr32: use GPIO line PB15 on EVKLCD10x boards for backlight The PB15 GPIO line is used to control the enable and disable signal for the backlight regulator on EVKLCD10x boards. This patch hands the I/O line over to the LCDC driver, which will control when to enable and disable the backlight. Signed-off-by: Hans-Christian Egtvedt [haavard.skinnemoen@atmel.com: reverted ac97c change] Signed-off-by: Haavard Skinnemoen --- arch/avr32/boards/atngw100/evklcd10x.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/avr32/boards/atngw100/evklcd10x.c b/arch/avr32/boards/atngw100/evklcd10x.c index fbc7b03ef886..4e3ab8e08007 100644 --- a/arch/avr32/boards/atngw100/evklcd10x.c +++ b/arch/avr32/boards/atngw100/evklcd10x.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -19,6 +20,7 @@ #include #include +#include #include static struct ac97c_platform_data __initdata ac97c0_data = { @@ -144,13 +146,29 @@ static struct atmel_lcdfb_info __initdata atevklcd10x_lcdc_data = { }; #endif +static void atevklcd10x_lcdc_power_control(int on) +{ + gpio_set_value(GPIO_PIN_PB(15), on); +} + static int __init atevklcd10x_init(void) { - at32_add_device_ac97c(0, &ac97c0_data); + /* PB15 is connected to the enable line on the boost regulator + * controlling the backlight for the LCD panel. + */ + at32_select_gpio(GPIO_PIN_PB(15), AT32_GPIOF_OUTPUT); + gpio_request(GPIO_PIN_PB(15), "backlight"); + gpio_direction_output(GPIO_PIN_PB(15), 0); + + atevklcd10x_lcdc_data.atmel_lcdfb_power_control = + atevklcd10x_lcdc_power_control; at32_add_device_lcdc(0, &atevklcd10x_lcdc_data, fbmem_start, fbmem_size, ATMEL_LCDC_ALT_18BIT | ATMEL_LCDC_PE_DVAL); + + at32_add_device_ac97c(0, &ac97c0_data); + return 0; } postcore_initcall(atevklcd10x_init); -- cgit v1.2.3-59-g8ed1b From b2a49ed0083ec5f51e9e0ed2e739b9b6259e330c Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Tue, 24 Mar 2009 15:45:19 +0100 Subject: avr32: fix timing LCD parameters for EVKLCD10X boards This patch adjusts the timing parameters for the Kyocera LCD panels connected on the EVKLCD10X addon boards. Signed-off-by: Hans-Christian Egtvedt Signed-off-by: Haavard Skinnemoen --- arch/avr32/boards/atngw100/evklcd10x.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/avr32/boards/atngw100/evklcd10x.c b/arch/avr32/boards/atngw100/evklcd10x.c index 4e3ab8e08007..ae6991205409 100644 --- a/arch/avr32/boards/atngw100/evklcd10x.c +++ b/arch/avr32/boards/atngw100/evklcd10x.c @@ -33,14 +33,14 @@ static struct ac97c_platform_data __initdata ac97c0_data = { #ifdef CONFIG_BOARD_ATNGW100_EVKLCD10X_VGA static struct fb_videomode __initdata tcg057vglad_modes[] = { { - .name = "640x480 @ 60", - .refresh = 60, + .name = "640x480 @ 50", + .refresh = 50, .xres = 640, .yres = 480, .pixclock = KHZ2PICOS(25180), - .left_margin = 64, .right_margin = 31, - .upper_margin = 34, .lower_margin = 2, - .hsync_len = 96, .vsync_len = 4, + .left_margin = 64, .right_margin = 96, + .upper_margin = 34, .lower_margin = 11, + .hsync_len = 64, .vsync_len = 15, .sync = 0, .vmode = FB_VMODE_NONINTERLACED, @@ -71,14 +71,14 @@ static struct atmel_lcdfb_info __initdata atevklcd10x_lcdc_data = { #elif CONFIG_BOARD_ATNGW100_EVKLCD10X_QVGA static struct fb_videomode __initdata tcg057qvlad_modes[] = { { - .name = "320x240 @ 60", - .refresh = 60, + .name = "320x240 @ 50", + .refresh = 50, .xres = 320, .yres = 240, .pixclock = KHZ2PICOS(6300), - .left_margin = 52, .right_margin = 28, - .upper_margin = 7, .lower_margin = 2, - .hsync_len = 96, .vsync_len = 4, + .left_margin = 34, .right_margin = 46, + .upper_margin = 7, .lower_margin = 15, + .hsync_len = 64, .vsync_len = 12, .sync = 0, .vmode = FB_VMODE_NONINTERLACED, -- cgit v1.2.3-59-g8ed1b From 472a6786b071ea88144e09eeb9b2a77549d98e75 Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Tue, 24 Mar 2009 15:45:20 +0100 Subject: atmel-usba-udc: use gpio_is_valid() to check vbus_pin I/O line This patch will convert to use gpio_is_valid() to check the vbus_pin platform data. It will also default to -ENODEV if no vbus_pin is defined. Signed-off-by: Hans-Christian Egtvedt Signed-off-by: Haavard Skinnemoen --- drivers/usb/gadget/atmel_usba_udc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/usb/gadget/atmel_usba_udc.c b/drivers/usb/gadget/atmel_usba_udc.c index 65b03e3445a1..ee7ebb674416 100644 --- a/drivers/usb/gadget/atmel_usba_udc.c +++ b/drivers/usb/gadget/atmel_usba_udc.c @@ -319,7 +319,7 @@ static inline void usba_cleanup_debugfs(struct usba_udc *udc) static int vbus_is_present(struct usba_udc *udc) { - if (udc->vbus_pin != -1) + if (gpio_is_valid(udc->vbus_pin)) return gpio_get_value(udc->vbus_pin); /* No Vbus detection: Assume always present */ @@ -1821,7 +1821,7 @@ int usb_gadget_register_driver(struct usb_gadget_driver *driver) DBG(DBG_GADGET, "registered driver `%s'\n", driver->driver.name); udc->vbus_prev = 0; - if (udc->vbus_pin != -1) + if (gpio_is_valid(udc->vbus_pin)) enable_irq(gpio_to_irq(udc->vbus_pin)); /* If Vbus is present, enable the controller and wait for reset */ @@ -1852,7 +1852,7 @@ int usb_gadget_unregister_driver(struct usb_gadget_driver *driver) if (driver != udc->driver || !driver->unbind) return -EINVAL; - if (udc->vbus_pin != -1) + if (gpio_is_valid(udc->vbus_pin)) disable_irq(gpio_to_irq(udc->vbus_pin)); spin_lock_irqsave(&udc->lock, flags); @@ -1910,7 +1910,7 @@ static int __init usba_udc_probe(struct platform_device *pdev) udc->pdev = pdev; udc->pclk = pclk; udc->hclk = hclk; - udc->vbus_pin = -1; + udc->vbus_pin = -ENODEV; ret = -ENOMEM; udc->regs = ioremap(regs->start, regs->end - regs->start + 1); @@ -1996,7 +1996,7 @@ static int __init usba_udc_probe(struct platform_device *pdev) goto err_device_add; } - if (pdata->vbus_pin >= 0) { + if (gpio_is_valid(pdata->vbus_pin)) { if (!gpio_request(pdata->vbus_pin, "atmel_usba_udc")) { udc->vbus_pin = pdata->vbus_pin; @@ -2005,7 +2005,7 @@ static int __init usba_udc_probe(struct platform_device *pdev) "atmel_usba_udc", udc); if (ret) { gpio_free(udc->vbus_pin); - udc->vbus_pin = -1; + udc->vbus_pin = -ENODEV; dev_warn(&udc->pdev->dev, "failed to request vbus irq; " "assuming always on\n"); @@ -2051,7 +2051,7 @@ static int __exit usba_udc_remove(struct platform_device *pdev) usba_ep_cleanup_debugfs(&usba_ep[i]); usba_cleanup_debugfs(udc); - if (udc->vbus_pin != -1) + if (gpio_is_valid(udc->vbus_pin)) gpio_free(udc->vbus_pin); free_irq(udc->irq, udc); -- cgit v1.2.3-59-g8ed1b From 9477ab2b2ae098423af2ed4fb1f7b864abfc14fc Mon Sep 17 00:00:00 2001 From: Hans-Christian Egtvedt Date: Tue, 24 Mar 2009 15:45:21 +0100 Subject: avr32: use gpio_is_valid() to check USBA vbus_pin I/O line This patch will use gpio_is_valid() to check the vbus_pin I/O line. Signed-off-by: Hans-Christian Egtvedt Signed-off-by: Haavard Skinnemoen --- arch/avr32/mach-at32ap/at32ap700x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c index 3fbfd1e32a9e..402cae8f9205 100644 --- a/arch/avr32/mach-at32ap/at32ap700x.c +++ b/arch/avr32/mach-at32ap/at32ap700x.c @@ -1753,7 +1753,7 @@ at32_add_device_usba(unsigned int id, struct usba_platform_data *data) if (platform_device_add_data(pdev, data, sizeof(usba_data))) goto out_free_pdev; - if (data->vbus_pin >= 0) + if (gpio_is_valid(data->vbus_pin)) at32_select_gpio(data->vbus_pin, 0); usba0_pclk.dev = &pdev->dev; -- cgit v1.2.3-59-g8ed1b From 963f0cf6d116d83c558a8efe9045c1c5ad7aed34 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 26 Mar 2009 12:51:21 +0200 Subject: UBIFS: add R/O compatibility Now UBIFS is supported by u-boot. If we ever decide to change the media format, then people will have to upgrade their u-boots to mount new format images. However, very often it is possible to preserve R/O forward-compatibility, even though the write forward-compatibility is not preserved. This patch introduces a new super-block field which stores the R/O compatibility version. Signed-off-by: Artem Bityutskiy Acked-by: Adrian Hunter --- fs/ubifs/sb.c | 35 +++++++++++++++++++++++++++++------ fs/ubifs/super.c | 14 ++++++++++++-- fs/ubifs/ubifs-media.h | 30 +++++++++++++++++++++++++++--- fs/ubifs/ubifs.h | 4 ++++ 4 files changed, 72 insertions(+), 11 deletions(-) diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 0dec47c87c6d..57085e43320f 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c) if (tmp64 > DEFAULT_MAX_RP_SIZE) tmp64 = DEFAULT_MAX_RP_SIZE; sup->rp_size = cpu_to_le64(tmp64); + sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); kfree(sup); @@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c) if (IS_ERR(sup)) return PTR_ERR(sup); + c->fmt_version = le32_to_cpu(sup->fmt_version); + c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); + /* * The software supports all previous versions but not future versions, * due to the unavailability of time-travelling equipment. */ - c->fmt_version = le32_to_cpu(sup->fmt_version); if (c->fmt_version > UBIFS_FORMAT_VERSION) { - ubifs_err("on-flash format version is %d, but software only " - "supports up to version %d", c->fmt_version, - UBIFS_FORMAT_VERSION); - err = -EINVAL; - goto out; + struct super_block *sb = c->vfs_sb; + int mounting_ro = sb->s_flags & MS_RDONLY; + + ubifs_assert(!c->ro_media || mounting_ro); + if (!mounting_ro || + c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { + ubifs_err("on-flash format version is w%d/r%d, but " + "software only supports up to version " + "w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { + ubifs_msg("only R/O mounting is possible"); + err = -EROFS; + } else + err = -EINVAL; + goto out; + } + + /* + * The FS is mounted R/O, and the media format is + * R/O-compatible with the UBIFS implementation, so we can + * mount. + */ + c->rw_incompat = 1; } if (c->fmt_version < 3) { diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 372c7fb66531..302a2056422e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1351,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c) x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("media format: %d (latest is %d)", - c->fmt_version, UBIFS_FORMAT_VERSION); + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); ubifs_msg("reserved for root: %llu bytes (%llu KiB)", c->report_rp_size, c->report_rp_size >> 10); @@ -1492,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c) { int err, lnum; + if (c->rw_incompat) { + ubifs_err("the file-system is not R/W-compatible"); + ubifs_msg("on-flash format version is w%d/r%d, but software " + "only supports up to version w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + return -EROFS; + } + mutex_lock(&c->umount_mutex); dbg_save_space_info(c); c->remounting_rw = 1; diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index b25fc36cf72f..3eee07e0c495 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -36,9 +36,31 @@ /* UBIFS node magic number (must not have the padding byte first or last) */ #define UBIFS_NODE_MAGIC 0x06101831 -/* UBIFS on-flash format version */ +/* + * UBIFS on-flash format version. This version is increased when the on-flash + * format is changing. If this happens, UBIFS is will support older versions as + * well. But older UBIFS code will not support newer formats. Format changes + * will be rare and only when absolutely necessary, e.g. to fix a bug or to add + * a new feature. + * + * UBIFS went into mainline kernel with format version 4. The older formats + * were development formats. + */ #define UBIFS_FORMAT_VERSION 4 +/* + * Read-only compatibility version. If the UBIFS format is changed, older UBIFS + * implementations will not be able to mount newer formats in read-write mode. + * However, depending on the change, it may be possible to mount newer formats + * in R/O mode. This is indicated by the R/O compatibility version which is + * stored in the super-block. + * + * This is needed to support boot-loaders which only need R/O mounting. With + * this flag it is possible to do UBIFS format changes without a need to update + * boot-loaders. + */ +#define UBIFS_RO_COMPAT_VERSION 0 + /* Minimum logical eraseblock size in bytes */ #define UBIFS_MIN_LEB_SZ (15*1024) @@ -53,7 +75,7 @@ /* * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes - * shorter than uncompressed data length, UBIFS preferes to leave this data + * shorter than uncompressed data length, UBIFS prefers to leave this data * node uncompress, because it'll be read faster. */ #define UBIFS_MIN_COMPRESS_DIFF 64 @@ -586,6 +608,7 @@ struct ubifs_pad_node { * @padding2: reserved for future, zeroes * @time_gran: time granularity in nanoseconds * @uuid: UUID generated when the file system image was created + * @ro_compat_version: UBIFS R/O compatibility version */ struct ubifs_sb_node { struct ubifs_ch ch; @@ -612,7 +635,8 @@ struct ubifs_sb_node { __le64 rp_size; __le32 time_gran; __u8 uuid[16]; - __u8 padding2[3972]; + __le32 ro_compat_version; + __u8 padding2[3968]; } __attribute__ ((packed)); /** diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index a53b9a6df2be..0a8341e14088 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -934,6 +934,7 @@ struct ubifs_debug_info; * by @commit_sem * @cnt_lock: protects @highest_inum and @max_sqnum counters * @fmt_version: UBIFS on-flash format version + * @ro_compat_version: R/O compatibility version * @uuid: UUID from super block * * @lhead_lnum: log head logical eraseblock number @@ -966,6 +967,7 @@ struct ubifs_debug_info; * recovery) * @bulk_read: enable bulk-reads * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @rw_incompat: the media is not R/W compatible * * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and * @calc_idx_sz @@ -1179,6 +1181,7 @@ struct ubifs_info { unsigned long long cmt_no; spinlock_t cnt_lock; int fmt_version; + int ro_compat_version; unsigned char uuid[16]; int lhead_lnum; @@ -1207,6 +1210,7 @@ struct ubifs_info { unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; + unsigned int rw_incompat:1; struct mutex tnc_mutex; struct ubifs_zbranch zroot; -- cgit v1.2.3-59-g8ed1b From 0f571515c332e00b3515dbe0859ceaa30ab66e00 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Fri, 6 Mar 2009 20:07:14 +0900 Subject: dmaengine: Add privatecnt to revert DMA_PRIVATE property Currently dma_request_channel() set DMA_PRIVATE capability but never clear it. So if a public channel was once grabbed by dma_request_channel(), the device stay PRIVATE forever. Add privatecnt member to dma_device to correctly revert it. [lg@denx.de: fix bad usage of 'chan' in dma_async_device_register] Signed-off-by: Atsushi Nemoto Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- drivers/dma/dmaengine.c | 8 ++++++++ include/linux/dmaengine.h | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index a41d1ea10fa3..92438e9dacc3 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -507,6 +507,7 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v * published in the general-purpose allocator */ dma_cap_set(DMA_PRIVATE, device->cap_mask); + device->privatecnt++; err = dma_chan_get(chan); if (err == -ENODEV) { @@ -518,6 +519,8 @@ struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, v dma_chan_name(chan), err); else break; + if (--device->privatecnt == 0) + dma_cap_clear(DMA_PRIVATE, device->cap_mask); chan->private = NULL; chan = NULL; } @@ -537,6 +540,9 @@ void dma_release_channel(struct dma_chan *chan) WARN_ONCE(chan->client_count != 1, "chan reference count %d != 1\n", chan->client_count); dma_chan_put(chan); + /* drop PRIVATE cap enabled by __dma_request_channel() */ + if (--chan->device->privatecnt == 0) + dma_cap_clear(DMA_PRIVATE, chan->device->cap_mask); chan->private = NULL; mutex_unlock(&dma_list_mutex); } @@ -719,6 +725,8 @@ int dma_async_device_register(struct dma_device *device) } } list_add_tail_rcu(&device->global_node, &dma_device_list); + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + device->privatecnt++; /* Always private */ dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 2afc2c95e42d..2e2aa3df170c 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -202,6 +202,7 @@ struct dma_async_tx_descriptor { /** * struct dma_device - info on the entity supplying DMA services * @chancnt: how many DMA channels are supported + * @privatecnt: how many DMA channels are requested by dma_request_channel * @channels: the list of struct dma_chan * @global_node: list_head for global dma_device_list * @cap_mask: one or more dma_capability flags @@ -224,6 +225,7 @@ struct dma_async_tx_descriptor { struct dma_device { unsigned int chancnt; + unsigned int privatecnt; struct list_head channels; struct list_head global_node; dma_cap_mask_t cap_mask; @@ -352,6 +354,13 @@ __dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) set_bit(tx_type, dstp->bits); } +#define dma_cap_clear(tx, mask) __dma_cap_clear((tx), &(mask)) +static inline void +__dma_cap_clear(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) +{ + clear_bit(tx_type, dstp->bits); +} + #define dma_cap_zero(mask) __dma_cap_zero(&(mask)) static inline void __dma_cap_zero(dma_cap_mask_t *dstp) { -- cgit v1.2.3-59-g8ed1b From 4bbfb85da27c27e9cc9a7fef4bd75df6361152ff Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Tue, 3 Feb 2009 14:17:29 +0800 Subject: ACPICA: Add error check to debug object dump routine Add check for invalid handle in acpi_ns_dump_one_object. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/nsdump.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/acpi/acpica/nsdump.c b/drivers/acpi/acpica/nsdump.c index 0da33c8e9ba2..e96d37a596b8 100644 --- a/drivers/acpi/acpica/nsdump.c +++ b/drivers/acpi/acpica/nsdump.c @@ -181,6 +181,12 @@ acpi_ns_dump_one_object(acpi_handle obj_handle, } this_node = acpi_ns_map_handle_to_node(obj_handle); + if (!this_node) { + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Invalid object handle %p\n", + obj_handle)); + return (AE_OK); + } + type = this_node->type; /* Check if the owner matches */ -- cgit v1.2.3-59-g8ed1b From ac5f98db7be34cefc244026f882cf030debb7431 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Tue, 3 Feb 2009 14:35:25 +0800 Subject: ACPICA: Allow OS override of all ACPI tables Previously, the table override mechanism was implemented for the DSDT only. Now, any table in the RSDT/XSDT can be replaced by the host OS. (including the DSDT). Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/acglobal.h | 1 - drivers/acpi/acpica/tbutils.c | 59 ++++++++++++++++++++++++++++++++---------- drivers/acpi/acpica/tbxface.c | 31 +--------------------- include/acpi/actbl.h | 5 ++-- 4 files changed, 50 insertions(+), 46 deletions(-) diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index ddb40f5c68fc..634fb0785e7e 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -371,7 +371,6 @@ ACPI_EXTERN char *acpi_gbl_db_buffer; ACPI_EXTERN char *acpi_gbl_db_filename; ACPI_EXTERN u32 acpi_gbl_db_debug_level; ACPI_EXTERN u32 acpi_gbl_db_console_debug_level; -ACPI_EXTERN struct acpi_table_header *acpi_gbl_db_table_ptr; ACPI_EXTERN struct acpi_namespace_node *acpi_gbl_db_scope_node; /* diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c index 22ce48985720..e285bedbb989 100644 --- a/drivers/acpi/acpica/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -287,7 +287,10 @@ u8 acpi_tb_checksum(u8 *buffer, u32 length) * * RETURN: None * - * DESCRIPTION: Install an ACPI table into the global data structure. + * DESCRIPTION: Install an ACPI table into the global data structure. The + * table override mechanism is implemented here to allow the host + * OS to replace any table before it is installed in the root + * table array. * ******************************************************************************/ @@ -295,7 +298,10 @@ void acpi_tb_install_table(acpi_physical_address address, u8 flags, char *signature, u32 table_index) { - struct acpi_table_header *table; + acpi_status status; + struct acpi_table_header *table_to_install; + struct acpi_table_header *mapped_table; + struct acpi_table_header *override_table = NULL; if (!address) { ACPI_ERROR((AE_INFO, @@ -306,41 +312,68 @@ acpi_tb_install_table(acpi_physical_address address, /* Map just the table header */ - table = acpi_os_map_memory(address, sizeof(struct acpi_table_header)); - if (!table) { + mapped_table = + acpi_os_map_memory(address, sizeof(struct acpi_table_header)); + if (!mapped_table) { return; } - /* If a particular signature is expected, signature must match */ + /* If a particular signature is expected (DSDT/FACS), it must match */ - if (signature && !ACPI_COMPARE_NAME(table->signature, signature)) { + if (signature && !ACPI_COMPARE_NAME(mapped_table->signature, signature)) { ACPI_ERROR((AE_INFO, - "Invalid signature 0x%X for ACPI table [%s]", - *ACPI_CAST_PTR(u32, table->signature), signature)); + "Invalid signature 0x%X for ACPI table, expected [%s]", + *ACPI_CAST_PTR(u32, mapped_table->signature), + signature)); goto unmap_and_exit; } + /* + * ACPI Table Override: + * + * Before we install the table, let the host OS override it with a new + * one if desired. Any table within the RSDT/XSDT can be replaced, + * including the DSDT which is pointed to by the FADT. + */ + status = acpi_os_table_override(mapped_table, &override_table); + if (ACPI_SUCCESS(status) && override_table) { + ACPI_INFO((AE_INFO, + "%4.4s @ 0x%p Table override, replaced with:", + mapped_table->signature, ACPI_CAST_PTR(void, + address))); + + acpi_gbl_root_table_list.tables[table_index].pointer = + override_table; + flags = ACPI_TABLE_ORIGIN_OVERRIDE; + address = ACPI_PTR_TO_PHYSADDR(override_table); + + table_to_install = override_table; + } else { + table_to_install = mapped_table; + } + /* Initialize the table entry */ acpi_gbl_root_table_list.tables[table_index].address = address; - acpi_gbl_root_table_list.tables[table_index].length = table->length; + acpi_gbl_root_table_list.tables[table_index].length = + table_to_install->length; acpi_gbl_root_table_list.tables[table_index].flags = flags; ACPI_MOVE_32_TO_32(& (acpi_gbl_root_table_list.tables[table_index]. - signature), table->signature); + signature), table_to_install->signature); - acpi_tb_print_table_header(address, table); + acpi_tb_print_table_header(address, table_to_install); if (table_index == ACPI_TABLE_INDEX_DSDT) { /* Global integer width is based upon revision of the DSDT */ - acpi_ut_set_integer_width(table->revision); + acpi_ut_set_integer_width(table_to_install->revision); } unmap_and_exit: - acpi_os_unmap_memory(table, sizeof(struct acpi_table_header)); + acpi_os_unmap_memory(mapped_table, sizeof(struct acpi_table_header)); } /******************************************************************************* diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c index c3e841f3cde9..f3f95e386334 100644 --- a/drivers/acpi/acpica/tbxface.c +++ b/drivers/acpi/acpica/tbxface.c @@ -491,7 +491,6 @@ ACPI_EXPORT_SYMBOL(acpi_get_table_by_index) static acpi_status acpi_tb_load_namespace(void) { acpi_status status; - struct acpi_table_header *table; u32 i; ACPI_FUNCTION_TRACE(tb_load_namespace); @@ -515,41 +514,13 @@ static acpi_status acpi_tb_load_namespace(void) goto unlock_and_exit; } - /* - * Find DSDT table - */ - status = - acpi_os_table_override(acpi_gbl_root_table_list. - tables[ACPI_TABLE_INDEX_DSDT].pointer, - &table); - if (ACPI_SUCCESS(status) && table) { - /* - * DSDT table has been found - */ - acpi_tb_delete_table(&acpi_gbl_root_table_list. - tables[ACPI_TABLE_INDEX_DSDT]); - acpi_gbl_root_table_list.tables[ACPI_TABLE_INDEX_DSDT].pointer = - table; - acpi_gbl_root_table_list.tables[ACPI_TABLE_INDEX_DSDT].length = - table->length; - acpi_gbl_root_table_list.tables[ACPI_TABLE_INDEX_DSDT].flags = - ACPI_TABLE_ORIGIN_UNKNOWN; - - ACPI_INFO((AE_INFO, "Table DSDT replaced by host OS")); - acpi_tb_print_table_header(0, table); - - if (no_auto_ssdt == 0) { - printk(KERN_WARNING "ACPI: DSDT override uses original SSDTs unless \"acpi_no_auto_ssdt\"\n"); - } - } + /* A valid DSDT is required */ status = acpi_tb_verify_table(&acpi_gbl_root_table_list. tables[ACPI_TABLE_INDEX_DSDT]); if (ACPI_FAILURE(status)) { - /* A valid DSDT is required */ - status = AE_NO_ACPI_TABLES; goto unlock_and_exit; } diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index bf8d4cfd8cf5..1f3dfdb1b370 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -310,8 +310,9 @@ struct acpi_table_desc { #define ACPI_TABLE_ORIGIN_UNKNOWN (0) #define ACPI_TABLE_ORIGIN_MAPPED (1) #define ACPI_TABLE_ORIGIN_ALLOCATED (2) -#define ACPI_TABLE_ORIGIN_MASK (3) -#define ACPI_TABLE_IS_LOADED (4) +#define ACPI_TABLE_ORIGIN_OVERRIDE (4) +#define ACPI_TABLE_ORIGIN_MASK (7) +#define ACPI_TABLE_IS_LOADED (8) /* * Get the remaining ACPI tables -- cgit v1.2.3-59-g8ed1b From 97cbb7d196845ec9a6c0e3cc33ec20503f8c4e73 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Tue, 3 Feb 2009 14:41:03 +0800 Subject: ACPICA: Remove extraneous parameter in table manager Removed the Flags parameter from several internal functions since it was not being used. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/actables.h | 7 +++---- drivers/acpi/acpica/tbfadt.c | 7 +++---- drivers/acpi/acpica/tbutils.c | 14 +++++++------- drivers/acpi/acpica/tbxface.c | 3 +-- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/acpi/acpica/actables.h b/drivers/acpi/acpica/actables.h index 7ce6e33c7f78..d8f8c5df4fbc 100644 --- a/drivers/acpi/acpica/actables.h +++ b/drivers/acpi/acpica/actables.h @@ -49,7 +49,7 @@ acpi_status acpi_allocate_root_table(u32 initial_table_count); /* * tbfadt - FADT parse/convert/validate */ -void acpi_tb_parse_fadt(u32 table_index, u8 flags); +void acpi_tb_parse_fadt(u32 table_index); void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length); @@ -109,9 +109,8 @@ acpi_tb_verify_checksum(struct acpi_table_header *table, u32 length); void acpi_tb_install_table(acpi_physical_address address, - u8 flags, char *signature, u32 table_index); + char *signature, u32 table_index); -acpi_status -acpi_tb_parse_root_table(acpi_physical_address rsdp_address, u8 flags); +acpi_status acpi_tb_parse_root_table(acpi_physical_address rsdp_address); #endif /* __ACTABLES_H__ */ diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index 3636e4f8fb73..4b683ccd4a94 100644 --- a/drivers/acpi/acpica/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -172,7 +172,6 @@ acpi_tb_init_generic_address(struct acpi_generic_address *generic_address, * FUNCTION: acpi_tb_parse_fadt * * PARAMETERS: table_index - Index for the FADT - * Flags - Flags * * RETURN: None * @@ -181,7 +180,7 @@ acpi_tb_init_generic_address(struct acpi_generic_address *generic_address, * ******************************************************************************/ -void acpi_tb_parse_fadt(u32 table_index, u8 flags) +void acpi_tb_parse_fadt(u32 table_index) { u32 length; struct acpi_table_header *table; @@ -219,10 +218,10 @@ void acpi_tb_parse_fadt(u32 table_index, u8 flags) /* Obtain the DSDT and FACS tables via their addresses within the FADT */ acpi_tb_install_table((acpi_physical_address) acpi_gbl_FADT.Xdsdt, - flags, ACPI_SIG_DSDT, ACPI_TABLE_INDEX_DSDT); + ACPI_SIG_DSDT, ACPI_TABLE_INDEX_DSDT); acpi_tb_install_table((acpi_physical_address) acpi_gbl_FADT.Xfacs, - flags, ACPI_SIG_FACS, ACPI_TABLE_INDEX_FACS); + ACPI_SIG_FACS, ACPI_TABLE_INDEX_FACS); } /******************************************************************************* diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c index e285bedbb989..a0b424356b98 100644 --- a/drivers/acpi/acpica/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -280,7 +280,6 @@ u8 acpi_tb_checksum(u8 *buffer, u32 length) * FUNCTION: acpi_tb_install_table * * PARAMETERS: Address - Physical address of DSDT or FACS - * Flags - Flags * Signature - Table signature, NULL if no need to * match * table_index - Index into root table array @@ -296,8 +295,9 @@ u8 acpi_tb_checksum(u8 *buffer, u32 length) void acpi_tb_install_table(acpi_physical_address address, - u8 flags, char *signature, u32 table_index) + char *signature, u32 table_index) { + u8 flags; acpi_status status; struct acpi_table_header *table_to_install; struct acpi_table_header *mapped_table; @@ -344,12 +344,13 @@ acpi_tb_install_table(acpi_physical_address address, acpi_gbl_root_table_list.tables[table_index].pointer = override_table; - flags = ACPI_TABLE_ORIGIN_OVERRIDE; address = ACPI_PTR_TO_PHYSADDR(override_table); table_to_install = override_table; + flags = ACPI_TABLE_ORIGIN_OVERRIDE; } else { table_to_install = mapped_table; + flags = ACPI_TABLE_ORIGIN_MAPPED; } /* Initialize the table entry */ @@ -435,7 +436,6 @@ acpi_tb_get_root_table_entry(u8 *table_entry, u32 table_entry_size) * FUNCTION: acpi_tb_parse_root_table * * PARAMETERS: Rsdp - Pointer to the RSDP - * Flags - Flags * * RETURN: Status * @@ -449,7 +449,7 @@ acpi_tb_get_root_table_entry(u8 *table_entry, u32 table_entry_size) ******************************************************************************/ acpi_status __init -acpi_tb_parse_root_table(acpi_physical_address rsdp_address, u8 flags) +acpi_tb_parse_root_table(acpi_physical_address rsdp_address) { struct acpi_table_rsdp *rsdp; u32 table_entry_size; @@ -600,14 +600,14 @@ acpi_tb_parse_root_table(acpi_physical_address rsdp_address, u8 flags) */ for (i = 2; i < acpi_gbl_root_table_list.count; i++) { acpi_tb_install_table(acpi_gbl_root_table_list.tables[i]. - address, flags, NULL, i); + address, NULL, i); /* Special case for FADT - get the DSDT and FACS */ if (ACPI_COMPARE_NAME (&acpi_gbl_root_table_list.tables[i].signature, ACPI_SIG_FADT)) { - acpi_tb_parse_fadt(i, flags); + acpi_tb_parse_fadt(i); } } diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c index f3f95e386334..416d01d9a970 100644 --- a/drivers/acpi/acpica/tbxface.c +++ b/drivers/acpi/acpica/tbxface.c @@ -150,8 +150,7 @@ acpi_initialize_tables(struct acpi_table_desc * initial_table_array, * Root Table Array. This array contains the information of the RSDT/XSDT * in a common, more useable format. */ - status = - acpi_tb_parse_root_table(rsdp_address, ACPI_TABLE_ORIGIN_MAPPED); + status = acpi_tb_parse_root_table(rsdp_address); return_ACPI_STATUS(status); } -- cgit v1.2.3-59-g8ed1b From d3ccaff827cef5a5c5a0f3c97e1e2e6d99f618cb Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Tue, 3 Feb 2009 14:43:04 +0800 Subject: ACPICA: Add override for dynamic tables Add a call to acpi_os_table_override during the installation of a dynamic table (loaded via the Load or LoadTable AML operators). Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/tbinstal.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpica/tbinstal.c b/drivers/acpi/acpica/tbinstal.c index 37374b21969d..ef269a297b57 100644 --- a/drivers/acpi/acpica/tbinstal.c +++ b/drivers/acpi/acpica/tbinstal.c @@ -103,7 +103,9 @@ acpi_status acpi_tb_verify_table(struct acpi_table_desc *table_desc) * * RETURN: Status * - * DESCRIPTION: This function is called to add the ACPI table + * DESCRIPTION: This function is called to add an ACPI table. It is used to + * dynamically load tables via the Load and load_table AML + * operators. * ******************************************************************************/ @@ -112,6 +114,7 @@ acpi_tb_add_table(struct acpi_table_desc *table_desc, u32 *table_index) { u32 i; acpi_status status = AE_OK; + struct acpi_table_header *override_table = NULL; ACPI_FUNCTION_TRACE(tb_add_table); @@ -201,6 +204,29 @@ acpi_tb_add_table(struct acpi_table_desc *table_desc, u32 *table_index) } } + /* + * ACPI Table Override: + * Allow the host to override dynamically loaded tables. + */ + status = acpi_os_table_override(table_desc->pointer, &override_table); + if (ACPI_SUCCESS(status) && override_table) { + ACPI_INFO((AE_INFO, + "%4.4s @ 0x%p Table override, replaced with:", + table_desc->pointer->signature, + ACPI_CAST_PTR(void, table_desc->address))); + + /* We can delete the table that was passed as a parameter */ + + acpi_tb_delete_table(table_desc); + + /* Setup descriptor for the new table */ + + table_desc->address = ACPI_PTR_TO_PHYSADDR(override_table); + table_desc->pointer = override_table; + table_desc->length = override_table->length; + table_desc->flags = ACPI_TABLE_ORIGIN_OVERRIDE; + } + /* Add the table to the global root table list */ status = acpi_tb_store_table(table_desc->address, table_desc->pointer, -- cgit v1.2.3-59-g8ed1b From 993958fecab6eabc6ee5b7ed65c56d716fd5cfe3 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Tue, 3 Feb 2009 15:14:33 +0800 Subject: ACPICA: Update FADT flag definitions Add new flags in the Boot Architecture flags field. Update comments for all FADT flags. Add FADT version when each flag was defined. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/pci/pci-acpi.c | 4 +-- include/acpi/actbl.h | 69 +++++++++++++++++++++++++------------------------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index deea8a187eb8..368ca72dffbc 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -386,12 +386,12 @@ static int __init acpi_pci_init(void) { int ret; - if (acpi_gbl_FADT.boot_flags & BAF_MSI_NOT_SUPPORTED) { + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) { printk(KERN_INFO"ACPI FADT declares the system doesn't support MSI, so disable it\n"); pci_no_msi(); } - if (acpi_gbl_FADT.boot_flags & BAF_PCIE_ASPM_CONTROL) { + if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_ASPM) { printk(KERN_INFO"ACPI FADT declares the system doesn't support PCIe ASPM, so disable it\n"); pcie_no_aspm(); } diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index 1f3dfdb1b370..222733d01f36 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -214,11 +214,11 @@ struct acpi_table_fadt { u16 flush_size; /* Processor's memory cache line width, in bytes */ u16 flush_stride; /* Number of flush strides that need to be read */ u8 duty_offset; /* Processor duty cycle index in processor's P_CNT reg */ - u8 duty_width; /* Processor duty cycle value bit width in P_CNT register. */ + u8 duty_width; /* Processor duty cycle value bit width in P_CNT register */ u8 day_alarm; /* Index to day-of-month alarm in RTC CMOS RAM */ u8 month_alarm; /* Index to month-of-year alarm in RTC CMOS RAM */ u8 century; /* Index to century in RTC CMOS RAM */ - u16 boot_flags; /* IA-PC Boot Architecture Flags. See Table 5-10 for description */ + u16 boot_flags; /* IA-PC Boot Architecture Flags (see below for individual flags) */ u8 reserved; /* Reserved, must be zero */ u32 flags; /* Miscellaneous flag bits (see below for individual flags) */ struct acpi_generic_address reset_register; /* 64-bit address of the Reset register */ @@ -236,32 +236,41 @@ struct acpi_table_fadt { struct acpi_generic_address xgpe1_block; /* 64-bit Extended General Purpose Event 1 Reg Blk address */ }; +/* FADT Boot Architecture Flags (boot_flags) */ + +#define ACPI_FADT_LEGACY_DEVICES (1) /* 00: [V2] System has LPC or ISA bus devices */ +#define ACPI_FADT_8042 (1<<1) /* 01: [V3] System has an 8042 controller on port 60/64 */ +#define ACPI_FADT_NO_VGA (1<<2) /* 02: [V4] It is not safe to probe for VGA hardware */ +#define ACPI_FADT_NO_MSI (1<<3) /* 03: [V4] Message Signaled Interrupts (MSI) must not be enabled */ +#define ACPI_FADT_NO_ASPM (1<<4) /* 04: [V4] PCIe ASPM control must not be enabled */ + +#define FADT2_REVISION_ID 3 + /* FADT flags */ -#define ACPI_FADT_WBINVD (1) /* 00: The wbinvd instruction works properly */ -#define ACPI_FADT_WBINVD_FLUSH (1<<1) /* 01: The wbinvd flushes but does not invalidate */ -#define ACPI_FADT_C1_SUPPORTED (1<<2) /* 02: All processors support C1 state */ -#define ACPI_FADT_C2_MP_SUPPORTED (1<<3) /* 03: C2 state works on MP system */ -#define ACPI_FADT_POWER_BUTTON (1<<4) /* 04: Power button is handled as a generic feature */ -#define ACPI_FADT_SLEEP_BUTTON (1<<5) /* 05: Sleep button is handled as a generic feature, or not present */ -#define ACPI_FADT_FIXED_RTC (1<<6) /* 06: RTC wakeup stat not in fixed register space */ -#define ACPI_FADT_S4_RTC_WAKE (1<<7) /* 07: RTC wakeup possible from S4 */ -#define ACPI_FADT_32BIT_TIMER (1<<8) /* 08: tmr_val is 32 bits 0=24-bits */ -#define ACPI_FADT_DOCKING_SUPPORTED (1<<9) /* 09: Docking supported */ -#define ACPI_FADT_RESET_REGISTER (1<<10) /* 10: System reset via the FADT RESET_REG supported */ -#define ACPI_FADT_SEALED_CASE (1<<11) /* 11: No internal expansion capabilities and case is sealed */ -#define ACPI_FADT_HEADLESS (1<<12) /* 12: No local video capabilities or local input devices */ -#define ACPI_FADT_SLEEP_TYPE (1<<13) /* 13: Must execute native instruction after writing SLP_TYPx register */ -#define ACPI_FADT_PCI_EXPRESS_WAKE (1<<14) /* 14: System supports PCIEXP_WAKE (STS/EN) bits (ACPI 3.0) */ -#define ACPI_FADT_PLATFORM_CLOCK (1<<15) /* 15: OSPM should use platform-provided timer (ACPI 3.0) */ -#define ACPI_FADT_S4_RTC_VALID (1<<16) /* 16: Contents of RTC_STS valid after S4 wake (ACPI 3.0) */ -#define ACPI_FADT_REMOTE_POWER_ON (1<<17) /* 17: System is compatible with remote power on (ACPI 3.0) */ -#define ACPI_FADT_APIC_CLUSTER (1<<18) /* 18: All local APICs must use cluster model (ACPI 3.0) */ -#define ACPI_FADT_APIC_PHYSICAL (1<<19) /* 19: All local x_aPICs must use physical dest mode (ACPI 3.0) */ +#define ACPI_FADT_WBINVD (1) /* 00: [V1] The wbinvd instruction works properly */ +#define ACPI_FADT_WBINVD_FLUSH (1<<1) /* 01: [V1] wbinvd flushes but does not invalidate caches */ +#define ACPI_FADT_C1_SUPPORTED (1<<2) /* 02: [V1] All processors support C1 state */ +#define ACPI_FADT_C2_MP_SUPPORTED (1<<3) /* 03: [V1] C2 state works on MP system */ +#define ACPI_FADT_POWER_BUTTON (1<<4) /* 04: [V1] Power button is handled as a control method device */ +#define ACPI_FADT_SLEEP_BUTTON (1<<5) /* 05: [V1] Sleep button is handled as a control method device */ +#define ACPI_FADT_FIXED_RTC (1<<6) /* 06: [V1] RTC wakeup status not in fixed register space */ +#define ACPI_FADT_S4_RTC_WAKE (1<<7) /* 07: [V1] RTC alarm can wake system from S4 */ +#define ACPI_FADT_32BIT_TIMER (1<<8) /* 08: [V1] ACPI timer width is 32-bit (0=24-bit) */ +#define ACPI_FADT_DOCKING_SUPPORTED (1<<9) /* 09: [V1] Docking supported */ +#define ACPI_FADT_RESET_REGISTER (1<<10) /* 10: [V2] System reset via the FADT RESET_REG supported */ +#define ACPI_FADT_SEALED_CASE (1<<11) /* 11: [V3] No internal expansion capabilities and case is sealed */ +#define ACPI_FADT_HEADLESS (1<<12) /* 12: [V3] No local video capabilities or local input devices */ +#define ACPI_FADT_SLEEP_TYPE (1<<13) /* 13: [V3] Must execute native instruction after writing SLP_TYPx register */ +#define ACPI_FADT_PCI_EXPRESS_WAKE (1<<14) /* 14: [V4] System supports PCIEXP_WAKE (STS/EN) bits (ACPI 3.0) */ +#define ACPI_FADT_PLATFORM_CLOCK (1<<15) /* 15: [V4] OSPM should use platform-provided timer (ACPI 3.0) */ +#define ACPI_FADT_S4_RTC_VALID (1<<16) /* 16: [V4] Contents of RTC_STS valid after S4 wake (ACPI 3.0) */ +#define ACPI_FADT_REMOTE_POWER_ON (1<<17) /* 17: [V4] System is compatible with remote power on (ACPI 3.0) */ +#define ACPI_FADT_APIC_CLUSTER (1<<18) /* 18: [V4] All local APICs must use cluster model (ACPI 3.0) */ +#define ACPI_FADT_APIC_PHYSICAL (1<<19) /* 19: [V4] All local x_aPICs must use physical dest mode (ACPI 3.0) */ + +/* FADT Prefered Power Management Profiles */ -/* - * FADT Prefered Power Management Profiles - */ enum acpi_prefered_pm_profiles { PM_UNSPECIFIED = 0, PM_DESKTOP = 1, @@ -272,16 +281,6 @@ enum acpi_prefered_pm_profiles { PM_APPLIANCE_PC = 6 }; -/* FADT Boot Arch Flags */ - -#define BAF_LEGACY_DEVICES 0x0001 -#define BAF_8042_KEYBOARD_CONTROLLER 0x0002 -#define BAF_MSI_NOT_SUPPORTED 0x0008 -#define BAF_PCIE_ASPM_CONTROL 0x0010 - -#define FADT2_REVISION_ID 3 -#define FADT2_MINUS_REVISION_ID 2 - /* Reset to default packing */ #pragma pack() -- cgit v1.2.3-59-g8ed1b From 6fc69d8beb0c16311f737df2c6f677057d50ab05 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Tue, 3 Feb 2009 15:16:51 +0800 Subject: ACPICA: Update version to 20090123 Update version to 20090123. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index c8e8cf45830f..8f9df6235255 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -47,7 +47,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20081204 +#define ACPI_CA_VERSION 0x20090123 #include "actypes.h" #include "actbl.h" -- cgit v1.2.3-59-g8ed1b From 531c633d2be8e79087335a46d3c017ca5837e588 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:06:12 +0800 Subject: ACPICA: Split out PM1 status registers from the FADT Add new globals for the PM1 status registers (A/B), similar to the way the PM1 enable registers are handled. Instead of overloading the FADT Event Register blocks. This makes the code clearer and less prone to error. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/acglobal.h | 5 +- drivers/acpi/acpica/hwregs.c | 18 +-- drivers/acpi/acpica/tbfadt.c | 246 ++++++++++++++++++++++------------------- 3 files changed, 147 insertions(+), 122 deletions(-) diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index 634fb0785e7e..f3e87ba43dbe 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -148,9 +148,12 @@ ACPI_EXTERN struct acpi_internal_rsdt acpi_gbl_root_table_list; ACPI_EXTERN struct acpi_table_fadt acpi_gbl_FADT; ACPI_EXTERN struct acpi_table_facs *acpi_gbl_FACS; -/* These addresses are calculated from FADT address values */ +/* These addresses are calculated from the FADT Event Block addresses */ +ACPI_EXTERN struct acpi_generic_address acpi_gbl_xpm1a_status; ACPI_EXTERN struct acpi_generic_address acpi_gbl_xpm1a_enable; + +ACPI_EXTERN struct acpi_generic_address acpi_gbl_xpm1b_status; ACPI_EXTERN struct acpi_generic_address acpi_gbl_xpm1b_enable; /* diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c index 4dc43b018517..7ef0b8eadbc7 100644 --- a/drivers/acpi/acpica/hwregs.c +++ b/drivers/acpi/acpica/hwregs.c @@ -72,21 +72,23 @@ acpi_status acpi_hw_clear_acpi_status(void) ACPI_DEBUG_PRINT((ACPI_DB_IO, "About to write %04X to %04X\n", ACPI_BITMASK_ALL_FIXED_STATUS, - (u16) acpi_gbl_FADT.xpm1a_event_block.address)); + (u16) acpi_gbl_xpm1a_status.address)); lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + /* Clear the fixed events */ + status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, ACPI_BITMASK_ALL_FIXED_STATUS); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } - /* Clear the fixed events */ + /* Write PM1B register if present */ - if (acpi_gbl_FADT.xpm1b_event_block.address) { + if (acpi_gbl_xpm1b_status.address) { status = acpi_write(ACPI_BITMASK_ALL_FIXED_STATUS, - &acpi_gbl_FADT.xpm1b_event_block); + &acpi_gbl_xpm1b_status); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } @@ -150,14 +152,14 @@ acpi_hw_register_read(u32 register_id, u32 * return_value) switch (register_id) { case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ - status = acpi_read(&value1, &acpi_gbl_FADT.xpm1a_event_block); + status = acpi_read(&value1, &acpi_gbl_xpm1a_status); if (ACPI_FAILURE(status)) { goto exit; } /* PM1B is optional */ - status = acpi_read(&value2, &acpi_gbl_FADT.xpm1b_event_block); + status = acpi_read(&value2, &acpi_gbl_xpm1b_status); value1 |= value2; break; @@ -267,14 +269,14 @@ acpi_status acpi_hw_register_write(u32 register_id, u32 value) /* Now we can write the data */ - status = acpi_write(value, &acpi_gbl_FADT.xpm1a_event_block); + status = acpi_write(value, &acpi_gbl_xpm1a_status); if (ACPI_FAILURE(status)) { goto exit; } /* PM1B is optional */ - status = acpi_write(value, &acpi_gbl_FADT.xpm1b_event_block); + status = acpi_write(value, &acpi_gbl_xpm1b_status); break; case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index 4b683ccd4a94..a8191efd9aa6 100644 --- a/drivers/acpi/acpica/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -57,6 +57,8 @@ static void acpi_tb_convert_fadt(void); static void acpi_tb_validate_fadt(void); +static void acpi_tb_setup_fadt_registers(void); + /* Table for conversion of FADT to common internal format and FADT validation */ typedef struct acpi_fadt_info { @@ -132,6 +134,35 @@ static struct acpi_fadt_info fadt_info_table[] = { #define ACPI_FADT_INFO_ENTRIES (sizeof (fadt_info_table) / sizeof (struct acpi_fadt_info)) +/* Table used to split Event Blocks into separate status/enable registers */ + +typedef struct acpi_fadt_pm_info { + struct acpi_generic_address *target; + u8 source; + u8 register_num; + +} acpi_fadt_pm_info; + +static struct acpi_fadt_pm_info fadt_pm_info_table[] = { + {&acpi_gbl_xpm1a_status, + ACPI_FADT_OFFSET(xpm1a_event_block), + 0}, + + {&acpi_gbl_xpm1a_enable, + ACPI_FADT_OFFSET(xpm1a_event_block), + 1}, + + {&acpi_gbl_xpm1b_status, + ACPI_FADT_OFFSET(xpm1b_event_block), + 0}, + + {&acpi_gbl_xpm1b_enable, + ACPI_FADT_OFFSET(xpm1b_event_block), + 1} +}; + +#define ACPI_FADT_PM_INFO_ENTRIES (sizeof (fadt_pm_info_table) / sizeof (struct acpi_fadt_pm_info)) + /******************************************************************************* * * FUNCTION: acpi_tb_init_generic_address @@ -207,7 +238,7 @@ void acpi_tb_parse_fadt(u32 table_index) */ (void)acpi_tb_verify_checksum(table, length); - /* Obtain a local copy of the FADT in common ACPI 2.0+ format */ + /* Create a local copy of the FADT in common ACPI 2.0+ format */ acpi_tb_create_local_fadt(table, length); @@ -265,11 +296,17 @@ void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length) ACPI_MEMCPY(&acpi_gbl_FADT, table, ACPI_MIN(length, sizeof(struct acpi_table_fadt))); - /* - * 1) Convert the local copy of the FADT to the common internal format - * 2) Validate some of the important values within the FADT - */ + /* Convert the local copy of the FADT to the common internal format */ + acpi_tb_convert_fadt(); + + /* Validate FADT values now, before we make any changes */ + + acpi_tb_validate_fadt(); + + /* Initialize the global ACPI register structures */ + + acpi_tb_setup_fadt_registers(); } /******************************************************************************* @@ -303,8 +340,6 @@ void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length) static void acpi_tb_convert_fadt(void) { - u8 pm1_register_bit_width; - u8 pm1_register_byte_width; struct acpi_generic_address *target64; u32 i; @@ -379,112 +414,6 @@ static void acpi_tb_convert_fadt(void) address32)); } } - - /* Validate FADT values now, before we make any changes */ - - acpi_tb_validate_fadt(); - - /* - * Optionally check all register lengths against the default values and - * update them if they are incorrect. - */ - if (acpi_gbl_use_default_register_widths) { - for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { - target64 = - ACPI_ADD_PTR(struct acpi_generic_address, - &acpi_gbl_FADT, - fadt_info_table[i].address64); - - /* - * If a valid register (Address != 0) and the (default_length > 0) - * (Not a GPE register), then check the width against the default. - */ - if ((target64->address) && - (fadt_info_table[i].default_length > 0) && - (fadt_info_table[i].default_length != - target64->bit_width)) { - ACPI_WARNING((AE_INFO, - "Invalid length for %s: %d, using default %d", - fadt_info_table[i].name, - target64->bit_width, - fadt_info_table[i]. - default_length)); - - /* Incorrect size, set width to the default */ - - target64->bit_width = - fadt_info_table[i].default_length; - } - } - } - - /* - * Get the length of the individual PM1 registers (enable and status). - * Each register is defined to be (event block length / 2). - */ - pm1_register_bit_width = - (u8)ACPI_DIV_2(acpi_gbl_FADT.xpm1a_event_block.bit_width); - pm1_register_byte_width = (u8)ACPI_DIV_8(pm1_register_bit_width); - - /* - * Adjust the lengths of the PM1 Event Blocks so that they can be used to - * access the PM1 status register(s). Use (width / 2) - */ - acpi_gbl_FADT.xpm1a_event_block.bit_width = pm1_register_bit_width; - acpi_gbl_FADT.xpm1b_event_block.bit_width = pm1_register_bit_width; - - /* - * Calculate separate GAS structs for the PM1 Enable registers. - * These addresses do not appear (directly) in the FADT, so it is - * useful to calculate them once, here. - * - * The PM event blocks are split into two register blocks, first is the - * PM Status Register block, followed immediately by the PM Enable - * Register block. Each is of length (xpm1x_event_block.bit_width/2). - * - * On various systems the v2 fields (and particularly the bit widths) - * cannot be relied upon, though. Hence resort to using the v1 length - * here (and warn about the inconsistency). - */ - if (acpi_gbl_FADT.xpm1a_event_block.bit_width - != acpi_gbl_FADT.pm1_event_length * 8) - printk(KERN_WARNING "FADT: " - "X_PM1a_EVT_BLK.bit_width (%u) does not match" - " PM1_EVT_LEN (%u)\n", - acpi_gbl_FADT.xpm1a_event_block.bit_width, - acpi_gbl_FADT.pm1_event_length); - - /* The PM1A register block is required */ - - acpi_tb_init_generic_address(&acpi_gbl_xpm1a_enable, - acpi_gbl_FADT.xpm1a_event_block.space_id, - pm1_register_byte_width, - (acpi_gbl_FADT.xpm1a_event_block.address + - pm1_register_byte_width)); - /* Don't forget to copy space_id of the GAS */ - acpi_gbl_xpm1a_enable.space_id = - acpi_gbl_FADT.xpm1a_event_block.space_id; - - /* The PM1B register block is optional, ignore if not present */ - - if (acpi_gbl_FADT.xpm1b_event_block.address) { - if (acpi_gbl_FADT.xpm1b_event_block.bit_width - != acpi_gbl_FADT.pm1_event_length * 8) - printk(KERN_WARNING "FADT: " - "X_PM1b_EVT_BLK.bit_width (%u) does not match" - " PM1_EVT_LEN (%u)\n", - acpi_gbl_FADT.xpm1b_event_block.bit_width, - acpi_gbl_FADT.pm1_event_length); - acpi_tb_init_generic_address(&acpi_gbl_xpm1b_enable, - acpi_gbl_FADT.xpm1b_event_block.space_id, - pm1_register_byte_width, - (acpi_gbl_FADT.xpm1b_event_block. - address + pm1_register_byte_width)); - /* Don't forget to copy space_id of the GAS */ - acpi_gbl_xpm1b_enable.space_id = - acpi_gbl_FADT.xpm1b_event_block.space_id; - - } } /****************************************************************************** @@ -607,3 +536,94 @@ static void acpi_tb_validate_fadt(void) } } } + +/****************************************************************************** + * + * FUNCTION: acpi_tb_setup_fadt_registers + * + * PARAMETERS: None, uses acpi_gbl_FADT. + * + * RETURN: None + * + * DESCRIPTION: Initialize global ACPI PM1 register definitions. Optionally, + * force FADT register definitions to their default lengths. + * + ******************************************************************************/ + +static void acpi_tb_setup_fadt_registers(void) +{ + struct acpi_generic_address *target64; + struct acpi_generic_address *source64; + u8 pm1_register_byte_width; + u32 i; + + /* + * Optionally check all register lengths against the default values and + * update them if they are incorrect. + */ + if (acpi_gbl_use_default_register_widths) { + for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { + target64 = + ACPI_ADD_PTR(struct acpi_generic_address, + &acpi_gbl_FADT, + fadt_info_table[i].address64); + + /* + * If a valid register (Address != 0) and the (default_length > 0) + * (Not a GPE register), then check the width against the default. + */ + if ((target64->address) && + (fadt_info_table[i].default_length > 0) && + (fadt_info_table[i].default_length != + target64->bit_width)) { + ACPI_WARNING((AE_INFO, + "Invalid length for %s: %d, using default %d", + fadt_info_table[i].name, + target64->bit_width, + fadt_info_table[i]. + default_length)); + + /* Incorrect size, set width to the default */ + + target64->bit_width = + fadt_info_table[i].default_length; + } + } + } + + /* + * Get the length of the individual PM1 registers (enable and status). + * Each register is defined to be (event block length / 2). Extra divide + * by 8 converts bits to bytes. + */ + pm1_register_byte_width = + (u8)ACPI_DIV_16(acpi_gbl_FADT.xpm1a_event_block.bit_width); + + /* + * Calculate separate GAS structs for the PM1x (A/B) Status and Enable + * registers. These addresses do not appear (directly) in the FADT, so it + * is useful to pre-calculate them from the PM1 Event Block definitions. + * + * The PM event blocks are split into two register blocks, first is the + * PM Status Register block, followed immediately by the PM Enable + * Register block. Each is of length (pm1_event_length/2) + * + * Note: The PM1A event block is required by the ACPI specification. + * However, the PM1B event block is optional and is rarely, if ever, + * used. + */ + + for (i = 0; i < ACPI_FADT_PM_INFO_ENTRIES; i++) { + source64 = + ACPI_ADD_PTR(struct acpi_generic_address, &acpi_gbl_FADT, + fadt_pm_info_table[i].source); + + acpi_tb_init_generic_address(fadt_pm_info_table[i].target, + source64->space_id, + pm1_register_byte_width, + source64->address + + (fadt_pm_info_table[i]. + register_num * + pm1_register_byte_width)); + } +} -- cgit v1.2.3-59-g8ed1b From 5e053e77f233342b56fda419d347fd2c958b9849 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Wed, 4 Mar 2009 14:31:25 +0800 Subject: ACPICA: Check for non-zero address before being converted to GAS Reported-by: FreeBSD community Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/tbfadt.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index a8191efd9aa6..43fe886b41a2 100644 --- a/drivers/acpi/acpica/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -618,12 +618,14 @@ static void acpi_tb_setup_fadt_registers(void) ACPI_ADD_PTR(struct acpi_generic_address, &acpi_gbl_FADT, fadt_pm_info_table[i].source); - acpi_tb_init_generic_address(fadt_pm_info_table[i].target, - source64->space_id, - pm1_register_byte_width, - source64->address + - (fadt_pm_info_table[i]. - register_num * - pm1_register_byte_width)); + if (source64->address) { + acpi_tb_init_generic_address(fadt_pm_info_table[i]. + target, source64->space_id, + pm1_register_byte_width, + source64->address + + (fadt_pm_info_table[i]. + register_num * + pm1_register_byte_width)); + } } } -- cgit v1.2.3-59-g8ed1b From d3319d1717a250e92be66a487dc3e0429112c284 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:07:58 +0800 Subject: ACPICA: Update comments in module header Enhance the explanations of the various package return types for clarity. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/acpredef.h | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h index 16a9ca9a66e4..63f656ae3604 100644 --- a/drivers/acpi/acpica/acpredef.h +++ b/drivers/acpi/acpica/acpredef.h @@ -52,41 +52,44 @@ * 1) PTYPE1 packages do not contain sub-packages. * * ACPI_PTYPE1_FIXED: Fixed length, 1 or 2 object types: - * object type - * count - * object type - * count + * object type + * count + * object type + * count * * ACPI_PTYPE1_VAR: Variable length: - * object type (Int/Buf/Ref) + * object type (Int/Buf/Ref) * - * ACPI_PTYPE1_OPTION: Package has some required and some optional elements: - * Used for _PRW + * ACPI_PTYPE1_OPTION: Package has some required and some optional elements + * (Used for _PRW) * * * 2) PTYPE2 packages contain a variable number of sub-packages. Each of the * different types describe the contents of each of the sub-packages. * * ACPI_PTYPE2: Each subpackage contains 1 or 2 object types: - * object type - * count - * object type - * count + * object type + * count + * object type + * count + * (Used for _ALR,_MLS,_PSS,_TRT,_TSS) * * ACPI_PTYPE2_COUNT: Each subpackage has a count as first element: - * object type + * object type + * (Used for _CSD,_PSD,_TSD) * * ACPI_PTYPE2_PKG_COUNT: Count of subpackages at start, 1 or 2 object types: - * object type - * count - * object type - * count + * object type + * count + * object type + * count + * (Used for _CST) * - * ACPI_PTYPE2_FIXED: Each subpackage is of fixed length: - * Used for _PRT + * ACPI_PTYPE2_FIXED: Each subpackage is of fixed length + * (Used for _PRT) * * ACPI_PTYPE2_MIN: Each subpackage has a variable but minimum length - * Used for _HPX + * (Used for _HPX) * *****************************************************************************/ -- cgit v1.2.3-59-g8ed1b From c520abadbc56a2740021910d2c6412f826a10059 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:20:12 +0800 Subject: ACPICA: Fix writes to optional PM1B registers On read, shift B register bits above the A bits. On write, shift B bits down to zero before writing the B register. New: acpi_hw_read_multiple, acpi_hw_write_multiple. These two functions now transparently handle the (possible) split registers for PM1 Status, Enable, and Control. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/hwregs.c | 205 +++++++++++++++++++++++++++++-------------- 1 file changed, 140 insertions(+), 65 deletions(-) diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c index 7ef0b8eadbc7..41f1173e02c2 100644 --- a/drivers/acpi/acpica/hwregs.c +++ b/drivers/acpi/acpica/hwregs.c @@ -51,6 +51,17 @@ #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwregs") +/* Local Prototypes */ +static acpi_status +acpi_hw_read_multiple(u32 *value, + struct acpi_generic_address *register_a, + struct acpi_generic_address *register_b); + +static acpi_status +acpi_hw_write_multiple(u32 value, + struct acpi_generic_address *register_a, + struct acpi_generic_address *register_b); + /******************************************************************************* * * FUNCTION: acpi_hw_clear_acpi_status @@ -63,6 +74,7 @@ ACPI_MODULE_NAME("hwregs") * THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED * ******************************************************************************/ + acpi_status acpi_hw_clear_acpi_status(void) { acpi_status status; @@ -143,64 +155,49 @@ struct acpi_bit_register_info *acpi_hw_get_bit_register_info(u32 register_id) acpi_status acpi_hw_register_read(u32 register_id, u32 * return_value) { - u32 value1 = 0; - u32 value2 = 0; + u32 value = 0; acpi_status status; ACPI_FUNCTION_TRACE(hw_register_read); switch (register_id) { - case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ + case ACPI_REGISTER_PM1_STATUS: /* PM1 A/B: 16-bit access each */ - status = acpi_read(&value1, &acpi_gbl_xpm1a_status); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = acpi_read(&value2, &acpi_gbl_xpm1b_status); - value1 |= value2; + status = acpi_hw_read_multiple(&value, + &acpi_gbl_xpm1a_status, + &acpi_gbl_xpm1b_status); break; - case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ + case ACPI_REGISTER_PM1_ENABLE: /* PM1 A/B: 16-bit access each */ - status = acpi_read(&value1, &acpi_gbl_xpm1a_enable); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = acpi_read(&value2, &acpi_gbl_xpm1b_enable); - value1 |= value2; + status = acpi_hw_read_multiple(&value, + &acpi_gbl_xpm1a_enable, + &acpi_gbl_xpm1b_enable); break; - case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ - - status = acpi_read(&value1, &acpi_gbl_FADT.xpm1a_control_block); - if (ACPI_FAILURE(status)) { - goto exit; - } + case ACPI_REGISTER_PM1_CONTROL: /* PM1 A/B: 16-bit access each */ - status = acpi_read(&value2, &acpi_gbl_FADT.xpm1b_control_block); - value1 |= value2; + status = acpi_hw_read_multiple(&value, + &acpi_gbl_FADT. + xpm1a_control_block, + &acpi_gbl_FADT. + xpm1b_control_block); break; case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ - status = acpi_read(&value1, &acpi_gbl_FADT.xpm2_control_block); + status = acpi_read(&value, &acpi_gbl_FADT.xpm2_control_block); break; case ACPI_REGISTER_PM_TIMER: /* 32-bit access */ - status = acpi_read(&value1, &acpi_gbl_FADT.xpm_timer_block); + status = acpi_read(&value, &acpi_gbl_FADT.xpm_timer_block); break; case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ status = - acpi_os_read_port(acpi_gbl_FADT.smi_command, &value1, 8); + acpi_os_read_port(acpi_gbl_FADT.smi_command, &value, 8); break; default: @@ -209,10 +206,8 @@ acpi_hw_register_read(u32 register_id, u32 * return_value) break; } - exit: - if (ACPI_SUCCESS(status)) { - *return_value = value1; + *return_value = value; } return_ACPI_STATUS(status); @@ -252,12 +247,13 @@ acpi_status acpi_hw_register_write(u32 register_id, u32 value) ACPI_FUNCTION_TRACE(hw_register_write); switch (register_id) { - case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ + case ACPI_REGISTER_PM1_STATUS: /* PM1 A/B: 16-bit access each */ /* Perform a read first to preserve certain bits (per ACPI spec) */ - status = acpi_hw_register_read(ACPI_REGISTER_PM1_STATUS, - &read_value); + status = acpi_hw_read_multiple(&read_value, + &acpi_gbl_xpm1a_status, + &acpi_gbl_xpm1b_status); if (ACPI_FAILURE(status)) { goto exit; } @@ -269,35 +265,29 @@ acpi_status acpi_hw_register_write(u32 register_id, u32 value) /* Now we can write the data */ - status = acpi_write(value, &acpi_gbl_xpm1a_status); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = acpi_write(value, &acpi_gbl_xpm1b_status); + status = acpi_hw_write_multiple(value, + &acpi_gbl_xpm1a_status, + &acpi_gbl_xpm1b_status); break; - case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ + case ACPI_REGISTER_PM1_ENABLE: /* PM1 A/B: 16-bit access */ - status = acpi_write(value, &acpi_gbl_xpm1a_enable); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = acpi_write(value, &acpi_gbl_xpm1b_enable); + status = acpi_hw_write_multiple(value, + &acpi_gbl_xpm1a_enable, + &acpi_gbl_xpm1b_enable); break; - case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ + case ACPI_REGISTER_PM1_CONTROL: /* PM1 A/B: 16-bit access each */ /* * Perform a read first to preserve certain bits (per ACPI spec) + * Note: This includes SCI_EN, we never want to change this bit */ - status = acpi_hw_register_read(ACPI_REGISTER_PM1_CONTROL, - &read_value); + status = acpi_hw_read_multiple(&read_value, + &acpi_gbl_FADT. + xpm1a_control_block, + &acpi_gbl_FADT. + xpm1b_control_block); if (ACPI_FAILURE(status)) { goto exit; } @@ -309,12 +299,11 @@ acpi_status acpi_hw_register_write(u32 register_id, u32 value) /* Now we can write the data */ - status = acpi_write(value, &acpi_gbl_FADT.xpm1a_control_block); - if (ACPI_FAILURE(status)) { - goto exit; - } - - status = acpi_write(value, &acpi_gbl_FADT.xpm1b_control_block); + status = acpi_hw_write_multiple(value, + &acpi_gbl_FADT. + xpm1a_control_block, + &acpi_gbl_FADT. + xpm1b_control_block); break; case ACPI_REGISTER_PM1A_CONTROL: /* 16-bit access */ @@ -346,6 +335,7 @@ acpi_status acpi_hw_register_write(u32 register_id, u32 value) break; default: + ACPI_ERROR((AE_INFO, "Unknown Register ID: %X", register_id)); status = AE_BAD_PARAMETER; break; } @@ -353,3 +343,88 @@ acpi_status acpi_hw_register_write(u32 register_id, u32 value) exit: return_ACPI_STATUS(status); } + +/****************************************************************************** + * + * FUNCTION: acpi_hw_read_multiple + * + * PARAMETERS: Value - Where the register value is returned + * register_a - First ACPI register (required) + * register_b - Second ACPI register (optional) + * + * RETURN: Status + * + * DESCRIPTION: Read from the specified two-part ACPI register (such as PM1 A/B) + * + ******************************************************************************/ + +static acpi_status +acpi_hw_read_multiple(u32 *value, + struct acpi_generic_address *register_a, + struct acpi_generic_address *register_b) +{ + u32 value_a = 0; + u32 value_b = 0; + acpi_status status; + + /* The first register is always required */ + + status = acpi_read(&value_a, register_a); + if (ACPI_FAILURE(status)) { + return (status); + } + + /* Second register is optional */ + + if (register_b->address) { + status = acpi_read(&value_b, register_b); + if (ACPI_FAILURE(status)) { + return (status); + } + } + + /* Shift the B bits above the A bits */ + + *value = value_a | (value_b << register_a->bit_width); + return (AE_OK); +} + +/****************************************************************************** + * + * FUNCTION: acpi_hw_write_multiple + * + * PARAMETERS: Value - The value to write + * register_a - First ACPI register (required) + * register_b - Second ACPI register (optional) + * + * RETURN: Status + * + * DESCRIPTION: Write to the specified two-part ACPI register (such as PM1 A/B) + * + ******************************************************************************/ + +static acpi_status +acpi_hw_write_multiple(u32 value, + struct acpi_generic_address *register_a, + struct acpi_generic_address *register_b) +{ + acpi_status status; + + /* The first register is always required */ + + status = acpi_write(value, register_a); + if (ACPI_FAILURE(status)) { + return (status); + } + + /* Second register is optional */ + + if (register_b->address) { + + /* Normalize the B bits before write */ + + status = acpi_write(value >> register_a->bit_width, register_b); + } + + return (status); +} -- cgit v1.2.3-59-g8ed1b From 227243a04d645377d09eda0dc8501e0d9c26ab89 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:24:50 +0800 Subject: ACPICA: Remove extra write for acpi_hw_clear_acpi_status This function was writing an optional PM1B status register twice. The existing call to the low-level acpi_hw_register_write automatically handles a possibly split PM1 A/B register. ACPICA BZ 751. http://www.acpica.org/bugzilla/show_bug.cgi?id=751 Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/hwregs.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c index 41f1173e02c2..9c8162128c2b 100644 --- a/drivers/acpi/acpica/hwregs.c +++ b/drivers/acpi/acpica/hwregs.c @@ -71,7 +71,6 @@ acpi_hw_write_multiple(u32 value, * RETURN: Status * * DESCRIPTION: Clears all fixed and general purpose status bits - * THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED * ******************************************************************************/ @@ -82,13 +81,13 @@ acpi_status acpi_hw_clear_acpi_status(void) ACPI_FUNCTION_TRACE(hw_clear_acpi_status); - ACPI_DEBUG_PRINT((ACPI_DB_IO, "About to write %04X to %04X\n", + ACPI_DEBUG_PRINT((ACPI_DB_IO, "About to write %04X to %0llX\n", ACPI_BITMASK_ALL_FIXED_STATUS, - (u16) acpi_gbl_xpm1a_status.address)); + acpi_gbl_xpm1a_status.address)); lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); - /* Clear the fixed events */ + /* Clear the fixed events in PM1 A/B */ status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, ACPI_BITMASK_ALL_FIXED_STATUS); @@ -96,16 +95,6 @@ acpi_status acpi_hw_clear_acpi_status(void) goto unlock_and_exit; } - /* Write PM1B register if present */ - - if (acpi_gbl_xpm1b_status.address) { - status = acpi_write(ACPI_BITMASK_ALL_FIXED_STATUS, - &acpi_gbl_xpm1b_status); - if (ACPI_FAILURE(status)) { - goto unlock_and_exit; - } - } - /* Clear the GPE Bits in all GPE registers in all GPE blocks */ status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block, NULL); -- cgit v1.2.3-59-g8ed1b From aefc7f9a0220a40beff9b6b3b320cbeae128d0e3 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:26:02 +0800 Subject: ACPICA: For PM1B registers, do not shift value read or written The PM1B registers are mirrors of the PM1A registers with different bits actually implemented. From the ACPI specification: "Although the bits can be split between the two register blocks (each register block has a unique pointer within the FADT), the bit positions are maintained. The register block with unimplemented bits (that is, those implemented in the other register block) always returns zeros, and writes have no side effects" Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/hwregs.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c index 9c8162128c2b..5a64e577975f 100644 --- a/drivers/acpi/acpica/hwregs.c +++ b/drivers/acpi/acpica/hwregs.c @@ -372,9 +372,17 @@ acpi_hw_read_multiple(u32 *value, } } - /* Shift the B bits above the A bits */ - - *value = value_a | (value_b << register_a->bit_width); + /* + * OR the two return values together. No shifting or masking is necessary, + * because of how the PM1 registers are defined in the ACPI specification: + * + * "Although the bits can be split between the two register blocks (each + * register block has a unique pointer within the FADT), the bit positions + * are maintained. The register block with unimplemented bits (that is, + * those implemented in the other register block) always returns zeros, + * and writes have no side effects" + */ + *value = (value_a | value_b); return (AE_OK); } @@ -406,13 +414,20 @@ acpi_hw_write_multiple(u32 value, return (status); } - /* Second register is optional */ - + /* + * Second register is optional + * + * No bit shifting or clearing is necessary, because of how the PM1 + * registers are defined in the ACPI specification: + * + * "Although the bits can be split between the two register blocks (each + * register block has a unique pointer within the FADT), the bit positions + * are maintained. The register block with unimplemented bits (that is, + * those implemented in the other register block) always returns zeros, + * and writes have no side effects" + */ if (register_b->address) { - - /* Normalize the B bits before write */ - - status = acpi_write(value >> register_a->bit_width, register_b); + status = acpi_write(value, register_b); } return (status); -- cgit v1.2.3-59-g8ed1b From ac0c84502697114a378057eed83a9baba879cfc9 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:28:02 +0800 Subject: ACPICA: Fix parameter validation for acpi_read/write Now return AE_BAD_PARAMETER if the input register pointer is null, and AE_BAD_ADDRESS if the register has an address of zero. Previously, these cases simply returned AE_OK. For optional registers such as PM1B status/enable/control, the caller should check for a valid register address before calling. ACPICA BZ 748. http://www.acpica.org/bugzilla/show_bug.cgi?id=748 Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/hwxface.c | 18 ++++++++---------- include/acpi/acexcep.h | 6 ++++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c index ae597c0ab53f..f67562ea0010 100644 --- a/drivers/acpi/acpica/hwxface.c +++ b/drivers/acpi/acpica/hwxface.c @@ -107,19 +107,18 @@ acpi_status acpi_read(u32 *value, struct acpi_generic_address *reg) ACPI_FUNCTION_NAME(acpi_read); /* - * Must have a valid pointer to a GAS structure, and - * a non-zero address within. However, don't return an error - * because the PM1A/B code must not fail if B isn't present. + * Must have a valid pointer to a GAS structure, and a non-zero address + * within. */ if (!reg) { - return (AE_OK); + return (AE_BAD_PARAMETER); } /* Get a local copy of the address. Handles possible alignment issues */ ACPI_MOVE_64_TO_64(&address, ®->address); if (!address) { - return (AE_OK); + return (AE_BAD_ADDRESS); } /* Supported widths are 8/16/32 */ @@ -187,19 +186,18 @@ acpi_status acpi_write(u32 value, struct acpi_generic_address *reg) ACPI_FUNCTION_NAME(acpi_write); /* - * Must have a valid pointer to a GAS structure, and - * a non-zero address within. However, don't return an error - * because the PM1A/B code must not fail if B isn't present. + * Must have a valid pointer to a GAS structure, and a non-zero address + * within. */ if (!reg) { - return (AE_OK); + return (AE_BAD_PARAMETER); } /* Get a local copy of the address. Handles possible alignment issues */ ACPI_MOVE_64_TO_64(&address, ®->address); if (!address) { - return (AE_OK); + return (AE_BAD_ADDRESS); } /* Supported widths are 8/16/32 */ diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index eda04546cdf6..473d584b1d31 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -103,8 +103,9 @@ #define AE_BAD_OCTAL_CONSTANT (acpi_status) (0x0006 | AE_CODE_PROGRAMMER) #define AE_BAD_DECIMAL_CONSTANT (acpi_status) (0x0007 | AE_CODE_PROGRAMMER) #define AE_MISSING_ARGUMENTS (acpi_status) (0x0008 | AE_CODE_PROGRAMMER) +#define AE_BAD_ADDRESS (acpi_status) (0x0009 | AE_CODE_PROGRAMMER) -#define AE_CODE_PGM_MAX 0x0008 +#define AE_CODE_PGM_MAX 0x0009 /* * Acpi table exceptions @@ -224,7 +225,8 @@ char const *acpi_gbl_exception_names_pgm[] = { "AE_BAD_HEX_CONSTANT", "AE_BAD_OCTAL_CONSTANT", "AE_BAD_DECIMAL_CONSTANT", - "AE_MISSING_ARGUMENTS" + "AE_MISSING_ARGUMENTS", + "AE_BAD_ADDRESS" }; char const *acpi_gbl_exception_names_tbl[] = { -- cgit v1.2.3-59-g8ed1b From 82d79b86646504a0ab97fe50ac137df65f651a27 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:31:05 +0800 Subject: ACPICA: Remove redundant ACPI_BITREG_SLEEP_TYPE_B This type is the same as TYPE_A. Removed this and all related instances. Renamed SLEEP_TYPE_A to simply SLEEP_TYPE. ACPICA BZ 754. http://www.acpica.org/bugzilla/show_bug.cgi?id=754 Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/aclocal.h | 4 ++-- drivers/acpi/acpica/hwsleep.c | 4 ++-- drivers/acpi/acpica/utglobal.c | 9 +++------ include/acpi/actypes.h | 9 ++++----- 4 files changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index 492d02761bb7..b9a0aa67ab10 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -818,7 +818,7 @@ struct acpi_bit_register_info { #define ACPI_BITMASK_SCI_ENABLE 0x0001 #define ACPI_BITMASK_BUS_MASTER_RLD 0x0002 #define ACPI_BITMASK_GLOBAL_LOCK_RELEASE 0x0004 -#define ACPI_BITMASK_SLEEP_TYPE_X 0x1C00 +#define ACPI_BITMASK_SLEEP_TYPE 0x1C00 #define ACPI_BITMASK_SLEEP_ENABLE 0x2000 #define ACPI_BITMASK_ARB_DISABLE 0x0001 @@ -844,7 +844,7 @@ struct acpi_bit_register_info { #define ACPI_BITPOSITION_SCI_ENABLE 0x00 #define ACPI_BITPOSITION_BUS_MASTER_RLD 0x01 #define ACPI_BITPOSITION_GLOBAL_LOCK_RELEASE 0x02 -#define ACPI_BITPOSITION_SLEEP_TYPE_X 0x0A +#define ACPI_BITPOSITION_SLEEP_TYPE 0x0A #define ACPI_BITPOSITION_SLEEP_ENABLE 0x0D #define ACPI_BITPOSITION_ARB_DISABLE 0x00 diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index a2af2a4f2f26..1a7d260f7826 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -242,7 +242,7 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) } sleep_type_reg_info = - acpi_hw_get_bit_register_info(ACPI_BITREG_SLEEP_TYPE_A); + acpi_hw_get_bit_register_info(ACPI_BITREG_SLEEP_TYPE); sleep_enable_reg_info = acpi_hw_get_bit_register_info(ACPI_BITREG_SLEEP_ENABLE); @@ -486,7 +486,7 @@ acpi_status acpi_leave_sleep_state_prep(u8 sleep_state) &acpi_gbl_sleep_type_b); if (ACPI_SUCCESS(status)) { sleep_type_reg_info = - acpi_hw_get_bit_register_info(ACPI_BITREG_SLEEP_TYPE_A); + acpi_hw_get_bit_register_info(ACPI_BITREG_SLEEP_TYPE); sleep_enable_reg_info = acpi_hw_get_bit_register_info(ACPI_BITREG_SLEEP_ENABLE); diff --git a/drivers/acpi/acpica/utglobal.c b/drivers/acpi/acpica/utglobal.c index a3ab9d9da299..bec0e21673cb 100644 --- a/drivers/acpi/acpica/utglobal.c +++ b/drivers/acpi/acpica/utglobal.c @@ -294,12 +294,9 @@ struct acpi_bit_register_info acpi_gbl_bit_register_info[ACPI_NUM_BITREG] = { /* ACPI_BITREG_GLOBAL_LOCK_RELEASE */ {ACPI_REGISTER_PM1_CONTROL, ACPI_BITPOSITION_GLOBAL_LOCK_RELEASE, ACPI_BITMASK_GLOBAL_LOCK_RELEASE}, - /* ACPI_BITREG_SLEEP_TYPE_A */ {ACPI_REGISTER_PM1_CONTROL, - ACPI_BITPOSITION_SLEEP_TYPE_X, - ACPI_BITMASK_SLEEP_TYPE_X}, - /* ACPI_BITREG_SLEEP_TYPE_B */ {ACPI_REGISTER_PM1_CONTROL, - ACPI_BITPOSITION_SLEEP_TYPE_X, - ACPI_BITMASK_SLEEP_TYPE_X}, + /* ACPI_BITREG_SLEEP_TYPE */ {ACPI_REGISTER_PM1_CONTROL, + ACPI_BITPOSITION_SLEEP_TYPE, + ACPI_BITMASK_SLEEP_TYPE}, /* ACPI_BITREG_SLEEP_ENABLE */ {ACPI_REGISTER_PM1_CONTROL, ACPI_BITPOSITION_SLEEP_ENABLE, ACPI_BITMASK_SLEEP_ENABLE}, diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index a20aab510173..1b9601c665e6 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -777,15 +777,14 @@ typedef u8 acpi_adr_space_type; #define ACPI_BITREG_SCI_ENABLE 0x0E #define ACPI_BITREG_BUS_MASTER_RLD 0x0F #define ACPI_BITREG_GLOBAL_LOCK_RELEASE 0x10 -#define ACPI_BITREG_SLEEP_TYPE_A 0x11 -#define ACPI_BITREG_SLEEP_TYPE_B 0x12 -#define ACPI_BITREG_SLEEP_ENABLE 0x13 +#define ACPI_BITREG_SLEEP_TYPE 0x11 +#define ACPI_BITREG_SLEEP_ENABLE 0x12 /* PM2 Control register */ -#define ACPI_BITREG_ARB_DISABLE 0x14 +#define ACPI_BITREG_ARB_DISABLE 0x13 -#define ACPI_BITREG_MAX 0x14 +#define ACPI_BITREG_MAX 0x13 #define ACPI_NUM_BITREG ACPI_BITREG_MAX + 1 /* -- cgit v1.2.3-59-g8ed1b From 32c9ef994d91352b710b948ec369cd18d6bca51b Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:36:05 +0800 Subject: ACPICA: Add function to handle PM1 control registers Added acpi_hw_write_pm1_control. This function writes both of the PM1 control registers (A/B). These registers are different than than the PM1 A/B status and enable registers in that different values can be written to the A/B registers. Most notably, the SLP_TYP bits can be different, as per the values returned from the _Sx predefined methods. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/achware.h | 5 +-- drivers/acpi/acpica/aclocal.h | 10 +++--- drivers/acpi/acpica/hwregs.c | 46 ++++++++++++++++++------ drivers/acpi/acpica/hwsleep.c | 84 ++++++++++++++++++------------------------- 4 files changed, 78 insertions(+), 67 deletions(-) diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index 58c69dc49ab4..4fa6ee6b1f7c 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -64,8 +64,9 @@ u32 acpi_hw_get_mode(void); */ struct acpi_bit_register_info *acpi_hw_get_bit_register_info(u32 register_id); -acpi_status -acpi_hw_register_read(u32 register_id, u32 * return_value); +acpi_status acpi_hw_write_pm1_control(u32 pm1a_control, u32 pm1b_control); + +acpi_status acpi_hw_register_read(u32 register_id, u32 *return_value); acpi_status acpi_hw_register_write(u32 register_id, u32 value); diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index b9a0aa67ab10..6feebc8f789a 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -781,12 +781,10 @@ struct acpi_bit_register_info { #define ACPI_REGISTER_PM1_STATUS 0x01 #define ACPI_REGISTER_PM1_ENABLE 0x02 #define ACPI_REGISTER_PM1_CONTROL 0x03 -#define ACPI_REGISTER_PM1A_CONTROL 0x04 -#define ACPI_REGISTER_PM1B_CONTROL 0x05 -#define ACPI_REGISTER_PM2_CONTROL 0x06 -#define ACPI_REGISTER_PM_TIMER 0x07 -#define ACPI_REGISTER_PROCESSOR_BLOCK 0x08 -#define ACPI_REGISTER_SMI_COMMAND_BLOCK 0x09 +#define ACPI_REGISTER_PM2_CONTROL 0x04 +#define ACPI_REGISTER_PM_TIMER 0x05 +#define ACPI_REGISTER_PROCESSOR_BLOCK 0x06 +#define ACPI_REGISTER_SMI_COMMAND_BLOCK 0x07 /* Masks used to access the bit_registers */ diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c index 5a64e577975f..edc627c9fc0d 100644 --- a/drivers/acpi/acpica/hwregs.c +++ b/drivers/acpi/acpica/hwregs.c @@ -129,6 +129,42 @@ struct acpi_bit_register_info *acpi_hw_get_bit_register_info(u32 register_id) return (&acpi_gbl_bit_register_info[register_id]); } +/****************************************************************************** + * + * FUNCTION: acpi_hw_write_pm1_control + * + * PARAMETERS: pm1a_control - Value to be written to PM1A control + * pm1b_control - Value to be written to PM1B control + * + * RETURN: Status + * + * DESCRIPTION: Write the PM1 A/B control registers. These registers are + * different than than the PM1 A/B status and enable registers + * in that different values can be written to the A/B registers. + * Most notably, the SLP_TYP bits can be different, as per the + * values returned from the _Sx predefined methods. + * + ******************************************************************************/ + +acpi_status acpi_hw_write_pm1_control(u32 pm1a_control, u32 pm1b_control) +{ + acpi_status status; + + ACPI_FUNCTION_TRACE(hw_write_pm1_control); + + status = acpi_write(pm1a_control, &acpi_gbl_FADT.xpm1a_control_block); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + if (acpi_gbl_FADT.xpm1b_control_block.address) { + status = + acpi_write(pm1b_control, + &acpi_gbl_FADT.xpm1b_control_block); + } + return_ACPI_STATUS(status); +} + /****************************************************************************** * * FUNCTION: acpi_hw_register_read @@ -295,16 +331,6 @@ acpi_status acpi_hw_register_write(u32 register_id, u32 value) xpm1b_control_block); break; - case ACPI_REGISTER_PM1A_CONTROL: /* 16-bit access */ - - status = acpi_write(value, &acpi_gbl_FADT.xpm1a_control_block); - break; - - case ACPI_REGISTER_PM1B_CONTROL: /* 16-bit access */ - - status = acpi_write(value, &acpi_gbl_FADT.xpm1b_control_block); - break; - case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ status = acpi_write(value, &acpi_gbl_FADT.xpm2_control_block); diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index 1a7d260f7826..51868f48159c 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -147,9 +147,8 @@ acpi_status acpi_enter_sleep_state_prep(u8 sleep_state) ACPI_FUNCTION_TRACE(acpi_enter_sleep_state_prep); - /* - * _PSW methods could be run here to enable wake-on keyboard, LAN, etc. - */ + /* _PSW methods could be run here to enable wake-on keyboard, LAN, etc. */ + status = acpi_get_sleep_type_data(sleep_state, &acpi_gbl_sleep_type_a, &acpi_gbl_sleep_type_b); @@ -223,8 +222,8 @@ ACPI_EXPORT_SYMBOL(acpi_enter_sleep_state_prep) ******************************************************************************/ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) { - u32 PM1Acontrol; - u32 PM1Bcontrol; + u32 pm1a_control; + u32 pm1b_control; struct acpi_bit_register_info *sleep_type_reg_info; struct acpi_bit_register_info *sleep_enable_reg_info; u32 in_value; @@ -289,24 +288,25 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) /* Get current value of PM1A control */ - status = acpi_hw_register_read(ACPI_REGISTER_PM1_CONTROL, &PM1Acontrol); + status = acpi_hw_register_read(ACPI_REGISTER_PM1_CONTROL, + &pm1a_control); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } ACPI_DEBUG_PRINT((ACPI_DB_INIT, "Entering sleep state [S%d]\n", sleep_state)); - /* Clear SLP_EN and SLP_TYP fields */ + /* Clear the SLP_EN and SLP_TYP fields */ - PM1Acontrol &= ~(sleep_type_reg_info->access_bit_mask | - sleep_enable_reg_info->access_bit_mask); - PM1Bcontrol = PM1Acontrol; + pm1a_control &= ~(sleep_type_reg_info->access_bit_mask | + sleep_enable_reg_info->access_bit_mask); + pm1b_control = pm1a_control; - /* Insert SLP_TYP bits */ + /* Insert the SLP_TYP bits */ - PM1Acontrol |= + pm1a_control |= (acpi_gbl_sleep_type_a << sleep_type_reg_info->bit_position); - PM1Bcontrol |= + pm1b_control |= (acpi_gbl_sleep_type_b << sleep_type_reg_info->bit_position); /* @@ -314,37 +314,25 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) * poorly implemented hardware. */ - /* Write #1: fill in SLP_TYP data */ - - status = acpi_hw_register_write(ACPI_REGISTER_PM1A_CONTROL, - PM1Acontrol); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } + /* Write #1: write the SLP_TYP data to the PM1 Control registers */ - status = acpi_hw_register_write(ACPI_REGISTER_PM1B_CONTROL, - PM1Bcontrol); + status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } - /* Insert SLP_ENABLE bit */ + /* Insert the sleep enable (SLP_EN) bit */ - PM1Acontrol |= sleep_enable_reg_info->access_bit_mask; - PM1Bcontrol |= sleep_enable_reg_info->access_bit_mask; + pm1a_control |= sleep_enable_reg_info->access_bit_mask; + pm1b_control |= sleep_enable_reg_info->access_bit_mask; - /* Write #2: SLP_TYP + SLP_EN */ + /* Flush caches, as per ACPI specification */ ACPI_FLUSH_CPU_CACHE(); - status = acpi_hw_register_write(ACPI_REGISTER_PM1A_CONTROL, - PM1Acontrol); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } + /* Write #2: Write both SLP_TYP + SLP_EN */ - status = acpi_hw_register_write(ACPI_REGISTER_PM1B_CONTROL, - PM1Bcontrol); + status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } @@ -471,8 +459,8 @@ acpi_status acpi_leave_sleep_state_prep(u8 sleep_state) acpi_status status; struct acpi_bit_register_info *sleep_type_reg_info; struct acpi_bit_register_info *sleep_enable_reg_info; - u32 PM1Acontrol; - u32 PM1Bcontrol; + u32 pm1a_control; + u32 pm1b_control; ACPI_FUNCTION_TRACE(acpi_leave_sleep_state_prep); @@ -493,31 +481,29 @@ acpi_status acpi_leave_sleep_state_prep(u8 sleep_state) /* Get current value of PM1A control */ status = acpi_hw_register_read(ACPI_REGISTER_PM1_CONTROL, - &PM1Acontrol); + &pm1a_control); if (ACPI_SUCCESS(status)) { - /* Clear SLP_EN and SLP_TYP fields */ + /* Clear the SLP_EN and SLP_TYP fields */ - PM1Acontrol &= ~(sleep_type_reg_info->access_bit_mask | - sleep_enable_reg_info-> - access_bit_mask); - PM1Bcontrol = PM1Acontrol; + pm1a_control &= ~(sleep_type_reg_info->access_bit_mask | + sleep_enable_reg_info-> + access_bit_mask); + pm1b_control = pm1a_control; - /* Insert SLP_TYP bits */ + /* Insert the SLP_TYP bits */ - PM1Acontrol |= + pm1a_control |= (acpi_gbl_sleep_type_a << sleep_type_reg_info-> bit_position); - PM1Bcontrol |= + pm1b_control |= (acpi_gbl_sleep_type_b << sleep_type_reg_info-> bit_position); - /* Just ignore any errors */ + /* Write the control registers and ignore any errors */ - (void)acpi_hw_register_write(ACPI_REGISTER_PM1A_CONTROL, - PM1Acontrol); - (void)acpi_hw_register_write(ACPI_REGISTER_PM1B_CONTROL, - PM1Bcontrol); + (void)acpi_hw_write_pm1_control(pm1a_control, + pm1b_control); } } -- cgit v1.2.3-59-g8ed1b From 3371c19c294a4cb3649aa4e84606be8a1d999e61 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:44:03 +0800 Subject: ACPICA: Remove ACPI_GET_OBJECT_TYPE macro Remove all instances of this obsolete macro, since it is now a simple reference to ->common.type. There were about 150 invocations of the macro across 41 files. ACPICA BZ 755. http://www.acpica.org/bugzilla/show_bug.cgi?id=755 Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/acmacros.h | 4 ---- drivers/acpi/acpica/dsmthdat.c | 2 +- drivers/acpi/acpica/dsobject.c | 6 +++--- drivers/acpi/acpica/dsopcode.c | 8 +++----- drivers/acpi/acpica/dsutils.c | 2 +- drivers/acpi/acpica/dswexec.c | 5 ++--- drivers/acpi/acpica/evgpeblk.c | 8 ++++---- drivers/acpi/acpica/evregion.c | 2 +- drivers/acpi/acpica/exconfig.c | 4 ++-- drivers/acpi/acpica/exconvrt.c | 11 +++++------ drivers/acpi/acpica/exdump.c | 19 ++++++++----------- drivers/acpi/acpica/exfield.c | 29 +++++++++++++---------------- drivers/acpi/acpica/exfldio.c | 8 ++++---- drivers/acpi/acpica/exmisc.c | 14 +++++++------- drivers/acpi/acpica/exoparg1.c | 13 +++++-------- drivers/acpi/acpica/exoparg2.c | 2 +- drivers/acpi/acpica/exoparg3.c | 7 +++---- drivers/acpi/acpica/exprep.c | 2 +- drivers/acpi/acpica/exresnte.c | 8 ++++---- drivers/acpi/acpica/exresolv.c | 9 ++++----- drivers/acpi/acpica/exresop.c | 23 +++++++++++------------ drivers/acpi/acpica/exstore.c | 18 ++++++++---------- drivers/acpi/acpica/exstoren.c | 29 ++++++++++++----------------- drivers/acpi/acpica/exutils.c | 2 +- drivers/acpi/acpica/hwxface.c | 6 +++--- drivers/acpi/acpica/nsaccess.c | 3 +-- drivers/acpi/acpica/nsdump.c | 2 +- drivers/acpi/acpica/nsobject.c | 18 ++++++++---------- drivers/acpi/acpica/nspredef.c | 8 ++++---- drivers/acpi/acpica/nsxfeval.c | 3 +-- drivers/acpi/acpica/rscalc.c | 7 +++---- drivers/acpi/acpica/rscreate.c | 15 +++++++-------- drivers/acpi/acpica/utcopy.c | 25 ++++++++++++------------- drivers/acpi/acpica/utdelete.c | 6 +++--- drivers/acpi/acpica/uteval.c | 12 ++++++------ drivers/acpi/acpica/utglobal.c | 2 +- drivers/acpi/acpica/utmisc.c | 3 +-- drivers/acpi/acpica/utobject.c | 7 +++---- 38 files changed, 158 insertions(+), 194 deletions(-) diff --git a/drivers/acpi/acpica/acmacros.h b/drivers/acpi/acpica/acmacros.h index 9c127e8e2d6d..91ac7d7b4402 100644 --- a/drivers/acpi/acpica/acmacros.h +++ b/drivers/acpi/acpica/acmacros.h @@ -292,10 +292,6 @@ #define ACPI_GET_DESCRIPTOR_TYPE(d) (((union acpi_descriptor *)(void *)(d))->common.descriptor_type) #define ACPI_SET_DESCRIPTOR_TYPE(d, t) (((union acpi_descriptor *)(void *)(d))->common.descriptor_type = t) -/* Macro to test the object type */ - -#define ACPI_GET_OBJECT_TYPE(d) (((union acpi_operand_object *)(void *)(d))->common.type) - /* * Macros for the master AML opcode table */ diff --git a/drivers/acpi/acpica/dsmthdat.c b/drivers/acpi/acpica/dsmthdat.c index da0f5468184c..22b1a3ce2c94 100644 --- a/drivers/acpi/acpica/dsmthdat.c +++ b/drivers/acpi/acpica/dsmthdat.c @@ -713,6 +713,6 @@ acpi_ds_method_data_get_type(u16 opcode, /* Get the object type */ - return_VALUE(ACPI_GET_OBJECT_TYPE(object)); + return_VALUE(object->type); } #endif diff --git a/drivers/acpi/acpica/dsobject.c b/drivers/acpi/acpica/dsobject.c index 15c628e6aa00..dab3f48f0b42 100644 --- a/drivers/acpi/acpica/dsobject.c +++ b/drivers/acpi/acpica/dsobject.c @@ -565,7 +565,7 @@ acpi_ds_create_node(struct acpi_walk_state *walk_state, /* Re-type the object according to its argument */ - node->type = ACPI_GET_OBJECT_TYPE(obj_desc); + node->type = obj_desc->common.type; /* Attach obj to node */ @@ -619,7 +619,7 @@ acpi_ds_init_object_from_op(struct acpi_walk_state *walk_state, /* Perform per-object initialization */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_BUFFER: /* @@ -803,7 +803,7 @@ acpi_ds_init_object_from_op(struct acpi_walk_state *walk_state, default: ACPI_ERROR((AE_INFO, "Unimplemented data type: %X", - ACPI_GET_OBJECT_TYPE(obj_desc))); + obj_desc->common.type)); status = AE_AML_OPERAND_TYPE; break; diff --git a/drivers/acpi/acpica/dsopcode.c b/drivers/acpi/acpica/dsopcode.c index 0c3b4dd60e8a..602ddaa10c29 100644 --- a/drivers/acpi/acpica/dsopcode.c +++ b/drivers/acpi/acpica/dsopcode.c @@ -484,7 +484,7 @@ acpi_ds_init_buffer_field(u16 aml_opcode, /* Host object must be a Buffer */ - if (ACPI_GET_OBJECT_TYPE(buffer_desc) != ACPI_TYPE_BUFFER) { + if (buffer_desc->common.type != ACPI_TYPE_BUFFER) { ACPI_ERROR((AE_INFO, "Target of Create Field is not a Buffer object - %s", acpi_ut_get_object_type_name(buffer_desc))); @@ -1365,10 +1365,8 @@ acpi_ds_exec_end_control_op(struct acpi_walk_state * walk_state, if ((ACPI_GET_DESCRIPTOR_TYPE (walk_state->results->results.obj_desc[0]) == ACPI_DESC_TYPE_OPERAND) - && - (ACPI_GET_OBJECT_TYPE - (walk_state->results->results.obj_desc[0]) == - ACPI_TYPE_LOCAL_REFERENCE) + && ((walk_state->results->results.obj_desc[0])-> + common.type == ACPI_TYPE_LOCAL_REFERENCE) && ((walk_state->results->results.obj_desc[0])-> reference.class != ACPI_REFCLASS_INDEX)) { status = diff --git a/drivers/acpi/acpica/dsutils.c b/drivers/acpi/acpica/dsutils.c index dabc23a46176..dfa104102926 100644 --- a/drivers/acpi/acpica/dsutils.c +++ b/drivers/acpi/acpica/dsutils.c @@ -816,7 +816,7 @@ acpi_status acpi_ds_evaluate_name_path(struct acpi_walk_state *walk_state) goto push_result; } - type = ACPI_GET_OBJECT_TYPE(*operand); + type = (*operand)->common.type; status = acpi_ex_resolve_to_value(operand, walk_state); if (ACPI_FAILURE(status)) { diff --git a/drivers/acpi/acpica/dswexec.c b/drivers/acpi/acpica/dswexec.c index 350e6656bc89..f0280856dc0e 100644 --- a/drivers/acpi/acpica/dswexec.c +++ b/drivers/acpi/acpica/dswexec.c @@ -138,11 +138,10 @@ acpi_ds_get_predicate_value(struct acpi_walk_state *walk_state, goto cleanup; } - if (ACPI_GET_OBJECT_TYPE(local_obj_desc) != ACPI_TYPE_INTEGER) { + if (local_obj_desc->common.type != ACPI_TYPE_INTEGER) { ACPI_ERROR((AE_INFO, "Bad predicate (not an integer) ObjDesc=%p State=%p Type=%X", - obj_desc, walk_state, - ACPI_GET_OBJECT_TYPE(obj_desc))); + obj_desc, walk_state, obj_desc->common.type)); status = AE_AML_OPERAND_TYPE; goto cleanup; diff --git a/drivers/acpi/acpica/evgpeblk.c b/drivers/acpi/acpica/evgpeblk.c index 484cc0565d5b..f7b3d2af9401 100644 --- a/drivers/acpi/acpica/evgpeblk.c +++ b/drivers/acpi/acpica/evgpeblk.c @@ -408,7 +408,7 @@ acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, */ obj_desc = pkg_desc->package.elements[0]; - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_INTEGER) { + if (obj_desc->common.type == ACPI_TYPE_INTEGER) { /* Use FADT-defined GPE device (from definition of _PRW) */ @@ -417,14 +417,14 @@ acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, /* Integer is the GPE number in the FADT described GPE blocks */ gpe_number = (u32) obj_desc->integer.value; - } else if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_PACKAGE) { + } else if (obj_desc->common.type == ACPI_TYPE_PACKAGE) { /* Package contains a GPE reference and GPE number within a GPE block */ if ((obj_desc->package.count < 2) || - (ACPI_GET_OBJECT_TYPE(obj_desc->package.elements[0]) != + ((obj_desc->package.elements[0])->common.type != ACPI_TYPE_LOCAL_REFERENCE) - || (ACPI_GET_OBJECT_TYPE(obj_desc->package.elements[1]) != + || ((obj_desc->package.elements[1])->common.type != ACPI_TYPE_INTEGER)) { goto cleanup; } diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c index 665c0887ab4d..86cbbdbfc5cd 100644 --- a/drivers/acpi/acpica/evregion.c +++ b/drivers/acpi/acpica/evregion.c @@ -691,7 +691,7 @@ acpi_ev_install_handler(acpi_handle obj_handle, /* Devices are handled different than regions */ - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_DEVICE) { + if (obj_desc->common.type == ACPI_TYPE_DEVICE) { /* Check if this Device already has a handler for this address space */ diff --git a/drivers/acpi/acpica/exconfig.c b/drivers/acpi/acpica/exconfig.c index 932bbc26aa04..70b39c7daeab 100644 --- a/drivers/acpi/acpica/exconfig.c +++ b/drivers/acpi/acpica/exconfig.c @@ -291,7 +291,7 @@ acpi_ex_load_op(union acpi_operand_object *obj_desc, /* Source Object can be either an op_region or a Buffer/Field */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_REGION: ACPI_DEBUG_PRINT((ACPI_DB_EXEC, @@ -501,7 +501,7 @@ acpi_status acpi_ex_unload_table(union acpi_operand_object *ddb_handle) */ if ((!ddb_handle) || (ACPI_GET_DESCRIPTOR_TYPE(ddb_handle) != ACPI_DESC_TYPE_OPERAND) || - (ACPI_GET_OBJECT_TYPE(ddb_handle) != ACPI_TYPE_LOCAL_REFERENCE)) { + (ddb_handle->common.type != ACPI_TYPE_LOCAL_REFERENCE)) { return_ACPI_STATUS(AE_BAD_PARAMETER); } diff --git a/drivers/acpi/acpica/exconvrt.c b/drivers/acpi/acpica/exconvrt.c index 0be10188316e..37d0d39e60a6 100644 --- a/drivers/acpi/acpica/exconvrt.c +++ b/drivers/acpi/acpica/exconvrt.c @@ -82,7 +82,7 @@ acpi_ex_convert_to_integer(union acpi_operand_object *obj_desc, ACPI_FUNCTION_TRACE_PTR(ex_convert_to_integer, obj_desc); - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_INTEGER: /* No conversion necessary */ @@ -116,7 +116,7 @@ acpi_ex_convert_to_integer(union acpi_operand_object *obj_desc, /* String conversion is different than Buffer conversion */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_STRING: /* @@ -206,7 +206,7 @@ acpi_ex_convert_to_buffer(union acpi_operand_object *obj_desc, ACPI_FUNCTION_TRACE_PTR(ex_convert_to_buffer, obj_desc); - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_BUFFER: /* No conversion necessary */ @@ -409,7 +409,7 @@ acpi_ex_convert_to_string(union acpi_operand_object * obj_desc, ACPI_FUNCTION_TRACE_PTR(ex_convert_to_string, obj_desc); - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_STRING: /* No conversion necessary */ @@ -605,8 +605,7 @@ acpi_ex_convert_to_target_type(acpi_object_type destination_type, default: /* No conversion allowed for these types */ - if (destination_type != - ACPI_GET_OBJECT_TYPE(source_desc)) { + if (destination_type != source_desc->common.type) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Explicit operator, will store (%s) over existing type (%s)\n", acpi_ut_get_object_type_name diff --git a/drivers/acpi/acpica/exdump.c b/drivers/acpi/acpica/exdump.c index aa313574b0df..193d23312e18 100644 --- a/drivers/acpi/acpica/exdump.c +++ b/drivers/acpi/acpica/exdump.c @@ -492,7 +492,7 @@ void acpi_ex_dump_operand(union acpi_operand_object *obj_desc, u32 depth) /* Decode object type */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_LOCAL_REFERENCE: acpi_os_printf("Reference: [%s] ", @@ -531,7 +531,7 @@ void acpi_ex_dump_operand(union acpi_operand_object *obj_desc, u32 depth) acpi_os_printf("%X", obj_desc->reference.value); - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_INTEGER) { + if (obj_desc->common.type == ACPI_TYPE_INTEGER) { /* Value is an Integer */ @@ -548,7 +548,7 @@ void acpi_ex_dump_operand(union acpi_operand_object *obj_desc, u32 depth) acpi_os_printf("%X", obj_desc->reference.value); - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_INTEGER) { + if (obj_desc->common.type == ACPI_TYPE_INTEGER) { /* Value is an Integer */ @@ -686,9 +686,8 @@ void acpi_ex_dump_operand(union acpi_operand_object *obj_desc, u32 depth) if (!obj_desc->buffer_field.buffer_obj) { ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "*NULL*\n")); - } else - if (ACPI_GET_OBJECT_TYPE(obj_desc->buffer_field.buffer_obj) - != ACPI_TYPE_BUFFER) { + } else if ((obj_desc->buffer_field.buffer_obj)->common.type != + ACPI_TYPE_BUFFER) { acpi_os_printf("*not a Buffer*\n"); } else { acpi_ex_dump_operand(obj_desc->buffer_field.buffer_obj, @@ -737,8 +736,7 @@ void acpi_ex_dump_operand(union acpi_operand_object *obj_desc, u32 depth) default: /* Unknown Type */ - acpi_os_printf("Unknown Type %X\n", - ACPI_GET_OBJECT_TYPE(obj_desc)); + acpi_os_printf("Unknown Type %X\n", obj_desc->common.type); break; } @@ -939,7 +937,7 @@ acpi_ex_dump_package_obj(union acpi_operand_object *obj_desc, /* Packages may only contain a few object types */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_INTEGER: acpi_os_printf("[Integer] = %8.8X%8.8X\n", @@ -990,8 +988,7 @@ acpi_ex_dump_package_obj(union acpi_operand_object *obj_desc, default: - acpi_os_printf("[Unknown Type] %X\n", - ACPI_GET_OBJECT_TYPE(obj_desc)); + acpi_os_printf("[Unknown Type] %X\n", obj_desc->common.type); break; } } diff --git a/drivers/acpi/acpica/exfield.c b/drivers/acpi/acpica/exfield.c index a352d0233857..546dcdd86785 100644 --- a/drivers/acpi/acpica/exfield.c +++ b/drivers/acpi/acpica/exfield.c @@ -84,7 +84,7 @@ acpi_ex_read_data_from_field(struct acpi_walk_state *walk_state, return_ACPI_STATUS(AE_BAD_PARAMETER); } - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_BUFFER_FIELD) { + if (obj_desc->common.type == ACPI_TYPE_BUFFER_FIELD) { /* * If the buffer_field arguments have not been previously evaluated, * evaluate them now and save the results. @@ -95,9 +95,8 @@ acpi_ex_read_data_from_field(struct acpi_walk_state *walk_state, return_ACPI_STATUS(status); } } - } else - if ((ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_LOCAL_REGION_FIELD) - && (obj_desc->field.region_obj->region.space_id == + } else if ((obj_desc->common.type == ACPI_TYPE_LOCAL_REGION_FIELD) && + (obj_desc->field.region_obj->region.space_id == ACPI_ADR_SPACE_SMBUS)) { /* * This is an SMBus read. We must create a buffer to hold the data @@ -163,7 +162,7 @@ acpi_ex_read_data_from_field(struct acpi_walk_state *walk_state, ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "FieldRead [TO]: Obj %p, Type %X, Buf %p, ByteLen %X\n", - obj_desc, ACPI_GET_OBJECT_TYPE(obj_desc), buffer, + obj_desc, obj_desc->common.type, buffer, (u32) length)); ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "FieldRead [FROM]: BitLen %X, BitOff %X, ByteOff %X\n", @@ -222,7 +221,7 @@ acpi_ex_write_data_to_field(union acpi_operand_object *source_desc, return_ACPI_STATUS(AE_AML_NO_OPERAND); } - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_BUFFER_FIELD) { + if (obj_desc->common.type == ACPI_TYPE_BUFFER_FIELD) { /* * If the buffer_field arguments have not been previously evaluated, * evaluate them now and save the results. @@ -233,9 +232,8 @@ acpi_ex_write_data_to_field(union acpi_operand_object *source_desc, return_ACPI_STATUS(status); } } - } else - if ((ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_LOCAL_REGION_FIELD) - && (obj_desc->field.region_obj->region.space_id == + } else if ((obj_desc->common.type == ACPI_TYPE_LOCAL_REGION_FIELD) && + (obj_desc->field.region_obj->region.space_id == ACPI_ADR_SPACE_SMBUS)) { /* * This is an SMBus write. We will bypass the entire field mechanism @@ -243,7 +241,7 @@ acpi_ex_write_data_to_field(union acpi_operand_object *source_desc, * * Source must be a buffer of sufficient size (ACPI_SMBUS_BUFFER_SIZE). */ - if (ACPI_GET_OBJECT_TYPE(source_desc) != ACPI_TYPE_BUFFER) { + if (source_desc->common.type != ACPI_TYPE_BUFFER) { ACPI_ERROR((AE_INFO, "SMBus write requires Buffer, found type %s", acpi_ut_get_object_type_name(source_desc))); @@ -291,7 +289,7 @@ acpi_ex_write_data_to_field(union acpi_operand_object *source_desc, /* Get a pointer to the data to be written */ - switch (ACPI_GET_OBJECT_TYPE(source_desc)) { + switch (source_desc->common.type) { case ACPI_TYPE_INTEGER: buffer = &source_desc->integer.value; length = sizeof(source_desc->integer.value); @@ -314,15 +312,14 @@ acpi_ex_write_data_to_field(union acpi_operand_object *source_desc, ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "FieldWrite [FROM]: Obj %p (%s:%X), Buf %p, ByteLen %X\n", source_desc, - acpi_ut_get_type_name(ACPI_GET_OBJECT_TYPE - (source_desc)), - ACPI_GET_OBJECT_TYPE(source_desc), buffer, length)); + acpi_ut_get_type_name(source_desc->common.type), + source_desc->common.type, buffer, length)); ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, "FieldWrite [TO]: Obj %p (%s:%X), BitLen %X, BitOff %X, ByteOff %X\n", obj_desc, - acpi_ut_get_type_name(ACPI_GET_OBJECT_TYPE(obj_desc)), - ACPI_GET_OBJECT_TYPE(obj_desc), + acpi_ut_get_type_name(obj_desc->common.type), + obj_desc->common.type, obj_desc->common_field.bit_length, obj_desc->common_field.start_field_bit_offset, obj_desc->common_field.base_byte_offset)); diff --git a/drivers/acpi/acpica/exfldio.c b/drivers/acpi/acpica/exfldio.c index ef58ac4e687b..1053e7cd92a1 100644 --- a/drivers/acpi/acpica/exfldio.c +++ b/drivers/acpi/acpica/exfldio.c @@ -94,9 +94,9 @@ acpi_ex_setup_region(union acpi_operand_object *obj_desc, /* We must have a valid region */ - if (ACPI_GET_OBJECT_TYPE(rgn_desc) != ACPI_TYPE_REGION) { + if (rgn_desc->common.type != ACPI_TYPE_REGION) { ACPI_ERROR((AE_INFO, "Needed Region, found type %X (%s)", - ACPI_GET_OBJECT_TYPE(rgn_desc), + rgn_desc->common.type, acpi_ut_get_object_type_name(rgn_desc))); return_ACPI_STATUS(AE_AML_OPERAND_TYPE); @@ -390,7 +390,7 @@ acpi_ex_field_datum_io(union acpi_operand_object *obj_desc, * index_field - Write to an Index Register, then read/write from/to a * Data Register */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_BUFFER_FIELD: /* * If the buffer_field arguments have not been previously evaluated, @@ -527,7 +527,7 @@ acpi_ex_field_datum_io(union acpi_operand_object *obj_desc, default: ACPI_ERROR((AE_INFO, "Wrong object type in field I/O %X", - ACPI_GET_OBJECT_TYPE(obj_desc))); + obj_desc->common.type)); status = AE_AML_INTERNAL; break; } diff --git a/drivers/acpi/acpica/exmisc.c b/drivers/acpi/acpica/exmisc.c index 6b0747ac683b..998eac329937 100644 --- a/drivers/acpi/acpica/exmisc.c +++ b/drivers/acpi/acpica/exmisc.c @@ -80,7 +80,7 @@ acpi_ex_get_object_reference(union acpi_operand_object *obj_desc, switch (ACPI_GET_DESCRIPTOR_TYPE(obj_desc)) { case ACPI_DESC_TYPE_OPERAND: - if (ACPI_GET_OBJECT_TYPE(obj_desc) != ACPI_TYPE_LOCAL_REFERENCE) { + if (obj_desc->common.type != ACPI_TYPE_LOCAL_REFERENCE) { return_ACPI_STATUS(AE_AML_OPERAND_TYPE); } @@ -260,7 +260,7 @@ acpi_ex_do_concatenate(union acpi_operand_object *operand0, * guaranteed to be either Integer/String/Buffer by the operand * resolution mechanism. */ - switch (ACPI_GET_OBJECT_TYPE(operand0)) { + switch (operand0->common.type) { case ACPI_TYPE_INTEGER: status = acpi_ex_convert_to_integer(operand1, &local_operand1, 16); @@ -277,7 +277,7 @@ acpi_ex_do_concatenate(union acpi_operand_object *operand0, default: ACPI_ERROR((AE_INFO, "Invalid object type: %X", - ACPI_GET_OBJECT_TYPE(operand0))); + operand0->common.type)); status = AE_AML_INTERNAL; } @@ -298,7 +298,7 @@ acpi_ex_do_concatenate(union acpi_operand_object *operand0, * 2) Two Strings concatenated to produce a new String * 3) Two Buffers concatenated to produce a new Buffer */ - switch (ACPI_GET_OBJECT_TYPE(operand0)) { + switch (operand0->common.type) { case ACPI_TYPE_INTEGER: /* Result of two Integers is a Buffer */ @@ -379,7 +379,7 @@ acpi_ex_do_concatenate(union acpi_operand_object *operand0, /* Invalid object type, should not happen here */ ACPI_ERROR((AE_INFO, "Invalid object type: %X", - ACPI_GET_OBJECT_TYPE(operand0))); + operand0->common.type)); status = AE_AML_INTERNAL; goto cleanup; } @@ -581,7 +581,7 @@ acpi_ex_do_logical_op(u16 opcode, * guaranteed to be either Integer/String/Buffer by the operand * resolution mechanism. */ - switch (ACPI_GET_OBJECT_TYPE(operand0)) { + switch (operand0->common.type) { case ACPI_TYPE_INTEGER: status = acpi_ex_convert_to_integer(operand1, &local_operand1, 16); @@ -608,7 +608,7 @@ acpi_ex_do_logical_op(u16 opcode, /* * Two cases: 1) Both Integers, 2) Both Strings or Buffers */ - if (ACPI_GET_OBJECT_TYPE(operand0) == ACPI_TYPE_INTEGER) { + if (operand0->common.type == ACPI_TYPE_INTEGER) { /* * 1) Both operands are of type integer * Note: local_operand1 may have changed above diff --git a/drivers/acpi/acpica/exoparg1.c b/drivers/acpi/acpica/exoparg1.c index b530480cc7d5..9635d21e568d 100644 --- a/drivers/acpi/acpica/exoparg1.c +++ b/drivers/acpi/acpica/exoparg1.c @@ -807,11 +807,9 @@ acpi_status acpi_ex_opcode_1A_0T_1R(struct acpi_walk_state *walk_state) acpi_namespace_node *) operand[0]); if (temp_desc - && - ((ACPI_GET_OBJECT_TYPE(temp_desc) == - ACPI_TYPE_STRING) - || (ACPI_GET_OBJECT_TYPE(temp_desc) == - ACPI_TYPE_LOCAL_REFERENCE))) { + && ((temp_desc->common.type == ACPI_TYPE_STRING) + || (temp_desc->common.type == + ACPI_TYPE_LOCAL_REFERENCE))) { operand[0] = temp_desc; acpi_ut_add_reference(temp_desc); } else { @@ -819,7 +817,7 @@ acpi_status acpi_ex_opcode_1A_0T_1R(struct acpi_walk_state *walk_state) goto cleanup; } } else { - switch (ACPI_GET_OBJECT_TYPE(operand[0])) { + switch ((operand[0])->common.type) { case ACPI_TYPE_LOCAL_REFERENCE: /* * This is a deref_of (local_x | arg_x) @@ -877,8 +875,7 @@ acpi_status acpi_ex_opcode_1A_0T_1R(struct acpi_walk_state *walk_state) if (ACPI_GET_DESCRIPTOR_TYPE(operand[0]) != ACPI_DESC_TYPE_NAMED) { - if (ACPI_GET_OBJECT_TYPE(operand[0]) == - ACPI_TYPE_STRING) { + if ((operand[0])->common.type == ACPI_TYPE_STRING) { /* * This is a deref_of (String). The string is a reference * to a named ACPI object. diff --git a/drivers/acpi/acpica/exoparg2.c b/drivers/acpi/acpica/exoparg2.c index 0b4f513ca885..85d95c92dfd3 100644 --- a/drivers/acpi/acpica/exoparg2.c +++ b/drivers/acpi/acpica/exoparg2.c @@ -399,7 +399,7 @@ acpi_status acpi_ex_opcode_2A_1T_1R(struct acpi_walk_state *walk_state) * At this point, the Source operand is a String, Buffer, or Package. * Verify that the index is within range. */ - switch (ACPI_GET_OBJECT_TYPE(operand[0])) { + switch ((operand[0])->common.type) { case ACPI_TYPE_STRING: if (index >= operand[0]->string.length) { diff --git a/drivers/acpi/acpica/exoparg3.c b/drivers/acpi/acpica/exoparg3.c index c6520bbf882b..253f9e122584 100644 --- a/drivers/acpi/acpica/exoparg3.c +++ b/drivers/acpi/acpica/exoparg3.c @@ -161,9 +161,8 @@ acpi_status acpi_ex_opcode_3A_1T_1R(struct acpi_walk_state *walk_state) * Create the return object. The Source operand is guaranteed to be * either a String or a Buffer, so just use its type. */ - return_desc = - acpi_ut_create_internal_object(ACPI_GET_OBJECT_TYPE - (operand[0])); + return_desc = acpi_ut_create_internal_object((operand[0])-> + common.type); if (!return_desc) { status = AE_NO_MEMORY; goto cleanup; @@ -191,7 +190,7 @@ acpi_status acpi_ex_opcode_3A_1T_1R(struct acpi_walk_state *walk_state) /* Strings always have a sub-pointer, not so for buffers */ - switch (ACPI_GET_OBJECT_TYPE(operand[0])) { + switch ((operand[0])->common.type) { case ACPI_TYPE_STRING: /* Always allocate a new buffer for the String */ diff --git a/drivers/acpi/acpica/exprep.c b/drivers/acpi/acpica/exprep.c index a226f74d4a5c..52fec07064f0 100644 --- a/drivers/acpi/acpica/exprep.c +++ b/drivers/acpi/acpica/exprep.c @@ -279,7 +279,7 @@ acpi_ex_decode_field_access(union acpi_operand_object *obj_desc, return_UINT32(0); } - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_BUFFER_FIELD) { + if (obj_desc->common.type == ACPI_TYPE_BUFFER_FIELD) { /* * buffer_field access can be on any byte boundary, so the * byte_alignment is always 1 byte -- regardless of any byte_alignment diff --git a/drivers/acpi/acpica/exresnte.c b/drivers/acpi/acpica/exresnte.c index a063a74006f6..607958ff467c 100644 --- a/drivers/acpi/acpica/exresnte.c +++ b/drivers/acpi/acpica/exresnte.c @@ -136,7 +136,7 @@ acpi_ex_resolve_node_to_value(struct acpi_namespace_node **object_ptr, switch (entry_type) { case ACPI_TYPE_PACKAGE: - if (ACPI_GET_OBJECT_TYPE(source_desc) != ACPI_TYPE_PACKAGE) { + if (source_desc->common.type != ACPI_TYPE_PACKAGE) { ACPI_ERROR((AE_INFO, "Object not a Package, type %s", acpi_ut_get_object_type_name(source_desc))); return_ACPI_STATUS(AE_AML_OPERAND_TYPE); @@ -154,7 +154,7 @@ acpi_ex_resolve_node_to_value(struct acpi_namespace_node **object_ptr, case ACPI_TYPE_BUFFER: - if (ACPI_GET_OBJECT_TYPE(source_desc) != ACPI_TYPE_BUFFER) { + if (source_desc->common.type != ACPI_TYPE_BUFFER) { ACPI_ERROR((AE_INFO, "Object not a Buffer, type %s", acpi_ut_get_object_type_name(source_desc))); return_ACPI_STATUS(AE_AML_OPERAND_TYPE); @@ -172,7 +172,7 @@ acpi_ex_resolve_node_to_value(struct acpi_namespace_node **object_ptr, case ACPI_TYPE_STRING: - if (ACPI_GET_OBJECT_TYPE(source_desc) != ACPI_TYPE_STRING) { + if (source_desc->common.type != ACPI_TYPE_STRING) { ACPI_ERROR((AE_INFO, "Object not a String, type %s", acpi_ut_get_object_type_name(source_desc))); return_ACPI_STATUS(AE_AML_OPERAND_TYPE); @@ -186,7 +186,7 @@ acpi_ex_resolve_node_to_value(struct acpi_namespace_node **object_ptr, case ACPI_TYPE_INTEGER: - if (ACPI_GET_OBJECT_TYPE(source_desc) != ACPI_TYPE_INTEGER) { + if (source_desc->common.type != ACPI_TYPE_INTEGER) { ACPI_ERROR((AE_INFO, "Object not a Integer, type %s", acpi_ut_get_object_type_name(source_desc))); return_ACPI_STATUS(AE_AML_OPERAND_TYPE); diff --git a/drivers/acpi/acpica/exresolv.c b/drivers/acpi/acpica/exresolv.c index f6105a6d6126..c93b54ce7f78 100644 --- a/drivers/acpi/acpica/exresolv.c +++ b/drivers/acpi/acpica/exresolv.c @@ -149,7 +149,7 @@ acpi_ex_resolve_object_to_value(union acpi_operand_object **stack_ptr, /* This is a union acpi_operand_object */ - switch (ACPI_GET_OBJECT_TYPE(stack_desc)) { + switch (stack_desc->common.type) { case ACPI_TYPE_LOCAL_REFERENCE: ref_type = stack_desc->reference.class; @@ -297,8 +297,7 @@ acpi_ex_resolve_object_to_value(union acpi_operand_object **stack_ptr, ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "FieldRead SourceDesc=%p Type=%X\n", - stack_desc, - ACPI_GET_OBJECT_TYPE(stack_desc))); + stack_desc, stack_desc->common.type)); status = acpi_ex_read_data_from_field(walk_state, stack_desc, @@ -386,7 +385,7 @@ acpi_ex_resolve_multiple(struct acpi_walk_state *walk_state, * specification of the object_type and size_of operators). This means * traversing the list of possibly many nested references. */ - while (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_LOCAL_REFERENCE) { + while (obj_desc->common.type == ACPI_TYPE_LOCAL_REFERENCE) { switch (obj_desc->reference.class) { case ACPI_REFCLASS_REFOF: case ACPI_REFCLASS_NAME: @@ -518,7 +517,7 @@ acpi_ex_resolve_multiple(struct acpi_walk_state *walk_state, * Now we are guaranteed to have an object that has not been created * via the ref_of or Index operators. */ - type = ACPI_GET_OBJECT_TYPE(obj_desc); + type = obj_desc->common.type; exit: /* Convert internal types to external types */ diff --git a/drivers/acpi/acpica/exresop.c b/drivers/acpi/acpica/exresop.c index 3c3802764bfb..5c729a9e9131 100644 --- a/drivers/acpi/acpica/exresop.c +++ b/drivers/acpi/acpica/exresop.c @@ -212,7 +212,7 @@ acpi_ex_resolve_operands(u16 opcode, /* ACPI internal object */ - object_type = ACPI_GET_OBJECT_TYPE(obj_desc); + object_type = obj_desc->common.type; /* Check for bad acpi_object_type */ @@ -287,8 +287,7 @@ acpi_ex_resolve_operands(u16 opcode, if ((ACPI_GET_DESCRIPTOR_TYPE(obj_desc) == ACPI_DESC_TYPE_OPERAND) - && (ACPI_GET_OBJECT_TYPE(obj_desc) == - ACPI_TYPE_STRING)) { + && (obj_desc->common.type == ACPI_TYPE_STRING)) { /* * String found - the string references a named object and * must be resolved to a node @@ -336,7 +335,7 @@ acpi_ex_resolve_operands(u16 opcode, * -- All others must be resolved below. */ if ((opcode == AML_STORE_OP) && - (ACPI_GET_OBJECT_TYPE(*stack_ptr) == + ((*stack_ptr)->common.type == ACPI_TYPE_LOCAL_REFERENCE) && ((*stack_ptr)->reference.class == ACPI_REFCLASS_INDEX)) { goto next_operand; @@ -490,7 +489,7 @@ acpi_ex_resolve_operands(u16 opcode, /* Need an operand of type INTEGER, STRING or BUFFER */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_BUFFER: @@ -512,7 +511,7 @@ acpi_ex_resolve_operands(u16 opcode, /* Need an operand of type STRING or BUFFER */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_STRING: case ACPI_TYPE_BUFFER: @@ -553,7 +552,7 @@ acpi_ex_resolve_operands(u16 opcode, * The only reference allowed here is a direct reference to * a namespace node. */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_PACKAGE: case ACPI_TYPE_STRING: case ACPI_TYPE_BUFFER: @@ -576,7 +575,7 @@ acpi_ex_resolve_operands(u16 opcode, /* Need a buffer or package or (ACPI 2.0) String */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_PACKAGE: case ACPI_TYPE_STRING: case ACPI_TYPE_BUFFER: @@ -598,7 +597,7 @@ acpi_ex_resolve_operands(u16 opcode, /* Need an operand of type REGION or a BUFFER (which could be a resolved region field) */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_BUFFER: case ACPI_TYPE_REGION: @@ -619,7 +618,7 @@ acpi_ex_resolve_operands(u16 opcode, /* Used by the Store() operator only */ - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_PACKAGE: case ACPI_TYPE_STRING: @@ -677,8 +676,8 @@ acpi_ex_resolve_operands(u16 opcode, * required object type (Simple cases only). */ status = acpi_ex_check_object_type(type_needed, - ACPI_GET_OBJECT_TYPE - (*stack_ptr), *stack_ptr); + (*stack_ptr)->common.type, + *stack_ptr); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/exstore.c b/drivers/acpi/acpica/exstore.c index e35e9b4f6a4e..90d606196c99 100644 --- a/drivers/acpi/acpica/exstore.c +++ b/drivers/acpi/acpica/exstore.c @@ -129,7 +129,7 @@ acpi_ex_do_debug_object(union acpi_operand_object *source_desc, /* source_desc is of type ACPI_DESC_TYPE_OPERAND */ - switch (ACPI_GET_OBJECT_TYPE(source_desc)) { + switch (source_desc->common.type) { case ACPI_TYPE_INTEGER: /* Output correct integer width */ @@ -324,7 +324,7 @@ acpi_ex_store(union acpi_operand_object *source_desc, /* Destination object must be a Reference or a Constant object */ - switch (ACPI_GET_OBJECT_TYPE(dest_desc)) { + switch (dest_desc->common.type) { case ACPI_TYPE_LOCAL_REFERENCE: break; @@ -460,9 +460,8 @@ acpi_ex_store_object_to_index(union acpi_operand_object *source_desc, */ obj_desc = *(index_desc->reference.where); - if (ACPI_GET_OBJECT_TYPE(source_desc) == - ACPI_TYPE_LOCAL_REFERENCE - && source_desc->reference.class == ACPI_REFCLASS_TABLE) { + if (source_desc->common.type == ACPI_TYPE_LOCAL_REFERENCE && + source_desc->reference.class == ACPI_REFCLASS_TABLE) { /* This is a DDBHandle, just add a reference to it */ @@ -520,8 +519,8 @@ acpi_ex_store_object_to_index(union acpi_operand_object *source_desc, * by the INDEX_OP code. */ obj_desc = index_desc->reference.object; - if ((ACPI_GET_OBJECT_TYPE(obj_desc) != ACPI_TYPE_BUFFER) && - (ACPI_GET_OBJECT_TYPE(obj_desc) != ACPI_TYPE_STRING)) { + if ((obj_desc->common.type != ACPI_TYPE_BUFFER) && + (obj_desc->common.type != ACPI_TYPE_STRING)) { return_ACPI_STATUS(AE_AML_OPERAND_TYPE); } @@ -529,7 +528,7 @@ acpi_ex_store_object_to_index(union acpi_operand_object *source_desc, * The assignment of the individual elements will be slightly * different for each source type. */ - switch (ACPI_GET_OBJECT_TYPE(source_desc)) { + switch (source_desc->common.type) { case ACPI_TYPE_INTEGER: /* Use the least-significant byte of the integer */ @@ -707,8 +706,7 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, /* No conversions for all other types. Just attach the source object */ status = acpi_ns_attach_object(node, source_desc, - ACPI_GET_OBJECT_TYPE - (source_desc)); + source_desc->common.type); break; } diff --git a/drivers/acpi/acpica/exstoren.c b/drivers/acpi/acpica/exstoren.c index 145d15305f70..608e838d537e 100644 --- a/drivers/acpi/acpica/exstoren.c +++ b/drivers/acpi/acpica/exstoren.c @@ -96,8 +96,7 @@ acpi_ex_resolve_object(union acpi_operand_object **source_desc_ptr, * are all essentially the same. This case handles the * "interchangeable" types Integer, String, and Buffer. */ - if (ACPI_GET_OBJECT_TYPE(source_desc) == - ACPI_TYPE_LOCAL_REFERENCE) { + if (source_desc->common.type == ACPI_TYPE_LOCAL_REFERENCE) { /* Resolve a reference object first */ @@ -117,13 +116,11 @@ acpi_ex_resolve_object(union acpi_operand_object **source_desc_ptr, /* Must have a Integer, Buffer, or String */ - if ((ACPI_GET_OBJECT_TYPE(source_desc) != ACPI_TYPE_INTEGER) && - (ACPI_GET_OBJECT_TYPE(source_desc) != ACPI_TYPE_BUFFER) && - (ACPI_GET_OBJECT_TYPE(source_desc) != ACPI_TYPE_STRING) && - !((ACPI_GET_OBJECT_TYPE(source_desc) == - ACPI_TYPE_LOCAL_REFERENCE) - && (source_desc->reference.class == - ACPI_REFCLASS_TABLE))) { + if ((source_desc->common.type != ACPI_TYPE_INTEGER) && + (source_desc->common.type != ACPI_TYPE_BUFFER) && + (source_desc->common.type != ACPI_TYPE_STRING) && + !((source_desc->common.type == ACPI_TYPE_LOCAL_REFERENCE) && + (source_desc->reference.class == ACPI_REFCLASS_TABLE))) { /* Conversion successful but still not a valid type */ @@ -218,8 +215,7 @@ acpi_ex_store_object_to_object(union acpi_operand_object *source_desc, return_ACPI_STATUS(status); } - if (ACPI_GET_OBJECT_TYPE(source_desc) != - ACPI_GET_OBJECT_TYPE(dest_desc)) { + if (source_desc->common.type != dest_desc->common.type) { /* * The source type does not match the type of the destination. * Perform the "implicit conversion" of the source to the current type @@ -229,11 +225,10 @@ acpi_ex_store_object_to_object(union acpi_operand_object *source_desc, * Otherwise, actual_src_desc is a temporary object to hold the * converted object. */ - status = - acpi_ex_convert_to_target_type(ACPI_GET_OBJECT_TYPE - (dest_desc), source_desc, - &actual_src_desc, - walk_state); + status = acpi_ex_convert_to_target_type(dest_desc->common.type, + source_desc, + &actual_src_desc, + walk_state); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } @@ -252,7 +247,7 @@ acpi_ex_store_object_to_object(union acpi_operand_object *source_desc, * We now have two objects of identical types, and we can perform a * copy of the *value* of the source object. */ - switch (ACPI_GET_OBJECT_TYPE(dest_desc)) { + switch (dest_desc->common.type) { case ACPI_TYPE_INTEGER: dest_desc->integer.value = actual_src_desc->integer.value; diff --git a/drivers/acpi/acpica/exutils.c b/drivers/acpi/acpica/exutils.c index 32b85d68e756..87730e944132 100644 --- a/drivers/acpi/acpica/exutils.c +++ b/drivers/acpi/acpica/exutils.c @@ -221,7 +221,7 @@ void acpi_ex_truncate_for32bit_table(union acpi_operand_object *obj_desc) */ if ((!obj_desc) || (ACPI_GET_DESCRIPTOR_TYPE(obj_desc) != ACPI_DESC_TYPE_OPERAND) || - (ACPI_GET_OBJECT_TYPE(obj_desc) != ACPI_TYPE_INTEGER)) { + (obj_desc->common.type != ACPI_TYPE_INTEGER)) { return; } diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c index f67562ea0010..4df9eacb7c88 100644 --- a/drivers/acpi/acpica/hwxface.c +++ b/drivers/acpi/acpica/hwxface.c @@ -532,7 +532,7 @@ acpi_get_sleep_type_data(u8 sleep_state, u8 *sleep_type_a, u8 *sleep_type_b) /* It must be of type Package */ - else if (ACPI_GET_OBJECT_TYPE(info->return_object) != ACPI_TYPE_PACKAGE) { + else if (info->return_object->common.type != ACPI_TYPE_PACKAGE) { ACPI_ERROR((AE_INFO, "Sleep State return object is not a Package")); status = AE_AML_OPERAND_TYPE; @@ -553,9 +553,9 @@ acpi_get_sleep_type_data(u8 sleep_state, u8 *sleep_type_a, u8 *sleep_type_b) /* The first two elements must both be of type Integer */ - else if ((ACPI_GET_OBJECT_TYPE(info->return_object->package.elements[0]) + else if (((info->return_object->package.elements[0])->common.type != ACPI_TYPE_INTEGER) || - (ACPI_GET_OBJECT_TYPE(info->return_object->package.elements[1]) + ((info->return_object->package.elements[1])->common.type != ACPI_TYPE_INTEGER)) { ACPI_ERROR((AE_INFO, "Sleep State return package elements are not both Integers (%s, %s)", diff --git a/drivers/acpi/acpica/nsaccess.c b/drivers/acpi/acpica/nsaccess.c index 88303ebe924c..b6968a65cd4f 100644 --- a/drivers/acpi/acpica/nsaccess.c +++ b/drivers/acpi/acpica/nsaccess.c @@ -234,8 +234,7 @@ acpi_status acpi_ns_root_initialize(void) /* Store pointer to value descriptor in the Node */ status = acpi_ns_attach_object(new_node, obj_desc, - ACPI_GET_OBJECT_TYPE - (obj_desc)); + obj_desc->common.type); /* Remove local reference to the object */ diff --git a/drivers/acpi/acpica/nsdump.c b/drivers/acpi/acpica/nsdump.c index e96d37a596b8..7bfa6c1286f1 100644 --- a/drivers/acpi/acpica/nsdump.c +++ b/drivers/acpi/acpica/nsdump.c @@ -515,7 +515,7 @@ acpi_ns_dump_one_object(acpi_handle obj_handle, case ACPI_DESC_TYPE_OPERAND: - obj_type = ACPI_GET_OBJECT_TYPE(obj_desc); + obj_type = obj_desc->common.type; if (obj_type > ACPI_TYPE_LOCAL_MAX) { acpi_os_printf diff --git a/drivers/acpi/acpica/nsobject.c b/drivers/acpi/acpica/nsobject.c index 08a97a57f8f9..3eb20bfda9d8 100644 --- a/drivers/acpi/acpica/nsobject.c +++ b/drivers/acpi/acpica/nsobject.c @@ -209,8 +209,7 @@ void acpi_ns_detach_object(struct acpi_namespace_node *node) obj_desc = node->object; - if (!obj_desc || - (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_LOCAL_DATA)) { + if (!obj_desc || (obj_desc->common.type == ACPI_TYPE_LOCAL_DATA)) { return_VOID; } @@ -220,8 +219,7 @@ void acpi_ns_detach_object(struct acpi_namespace_node *node) if (ACPI_GET_DESCRIPTOR_TYPE(obj_desc) == ACPI_DESC_TYPE_OPERAND) { node->object = obj_desc->common.next_object; if (node->object && - (ACPI_GET_OBJECT_TYPE(node->object) != - ACPI_TYPE_LOCAL_DATA)) { + ((node->object)->common.type != ACPI_TYPE_LOCAL_DATA)) { node->object = node->object->common.next_object; } } @@ -267,7 +265,7 @@ union acpi_operand_object *acpi_ns_get_attached_object(struct ((ACPI_GET_DESCRIPTOR_TYPE(node->object) != ACPI_DESC_TYPE_OPERAND) && (ACPI_GET_DESCRIPTOR_TYPE(node->object) != ACPI_DESC_TYPE_NAMED)) - || (ACPI_GET_OBJECT_TYPE(node->object) == ACPI_TYPE_LOCAL_DATA)) { + || ((node->object)->common.type == ACPI_TYPE_LOCAL_DATA)) { return_PTR(NULL); } @@ -294,9 +292,9 @@ union acpi_operand_object *acpi_ns_get_secondary_object(union ACPI_FUNCTION_TRACE_PTR(ns_get_secondary_object, obj_desc); if ((!obj_desc) || - (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_LOCAL_DATA) || + (obj_desc->common.type == ACPI_TYPE_LOCAL_DATA) || (!obj_desc->common.next_object) || - (ACPI_GET_OBJECT_TYPE(obj_desc->common.next_object) == + ((obj_desc->common.next_object)->common.type == ACPI_TYPE_LOCAL_DATA)) { return_PTR(NULL); } @@ -331,7 +329,7 @@ acpi_ns_attach_data(struct acpi_namespace_node *node, prev_obj_desc = NULL; obj_desc = node->object; while (obj_desc) { - if ((ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_LOCAL_DATA) && + if ((obj_desc->common.type == ACPI_TYPE_LOCAL_DATA) && (obj_desc->data.handler == handler)) { return (AE_ALREADY_EXISTS); } @@ -385,7 +383,7 @@ acpi_ns_detach_data(struct acpi_namespace_node * node, prev_obj_desc = NULL; obj_desc = node->object; while (obj_desc) { - if ((ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_LOCAL_DATA) && + if ((obj_desc->common.type == ACPI_TYPE_LOCAL_DATA) && (obj_desc->data.handler == handler)) { if (prev_obj_desc) { prev_obj_desc->common.next_object = @@ -428,7 +426,7 @@ acpi_ns_get_attached_data(struct acpi_namespace_node * node, obj_desc = node->object; while (obj_desc) { - if ((ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_LOCAL_DATA) && + if ((obj_desc->common.type == ACPI_TYPE_LOCAL_DATA) && (obj_desc->data.handler == handler)) { *data = obj_desc->data.pointer; return (AE_OK); diff --git a/drivers/acpi/acpica/nspredef.c b/drivers/acpi/acpica/nspredef.c index 452703290d35..72dd7b198520 100644 --- a/drivers/acpi/acpica/nspredef.c +++ b/drivers/acpi/acpica/nspredef.c @@ -221,7 +221,7 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, /* For returned Package objects, check the type of all sub-objects */ - if (ACPI_GET_OBJECT_TYPE(return_object) == ACPI_TYPE_PACKAGE) { + if (return_object->common.type == ACPI_TYPE_PACKAGE) { status = acpi_ns_check_package(pathname, return_object_ptr, predefined); @@ -858,7 +858,7 @@ acpi_ns_check_object_type(char *pathname, * from all of the predefined names (including elements of returned * packages) */ - switch (ACPI_GET_OBJECT_TYPE(return_object)) { + switch (return_object->common.type) { case ACPI_TYPE_INTEGER: return_btype = ACPI_RTYPE_INTEGER; break; @@ -901,7 +901,7 @@ acpi_ns_check_object_type(char *pathname, /* For reference objects, check that the reference type is correct */ - if (ACPI_GET_OBJECT_TYPE(return_object) == ACPI_TYPE_LOCAL_REFERENCE) { + if (return_object->common.type == ACPI_TYPE_LOCAL_REFERENCE) { status = acpi_ns_check_reference(pathname, return_object); } @@ -1006,7 +1006,7 @@ acpi_ns_repair_object(u32 expected_btypes, union acpi_operand_object *new_object; acpi_size length; - switch (ACPI_GET_OBJECT_TYPE(return_object)) { + switch (return_object->common.type) { case ACPI_TYPE_BUFFER: if (!(expected_btypes & ACPI_RTYPE_STRING)) { diff --git a/drivers/acpi/acpica/nsxfeval.c b/drivers/acpi/acpica/nsxfeval.c index 22a7171ac1ed..2583a66a60a7 100644 --- a/drivers/acpi/acpica/nsxfeval.c +++ b/drivers/acpi/acpica/nsxfeval.c @@ -387,8 +387,7 @@ static void acpi_ns_resolve_references(struct acpi_evaluate_info *info) /* We are interested in reference objects only */ - if (ACPI_GET_OBJECT_TYPE(info->return_object) != - ACPI_TYPE_LOCAL_REFERENCE) { + if ((info->return_object)->common.type != ACPI_TYPE_LOCAL_REFERENCE) { return; } diff --git a/drivers/acpi/acpica/rscalc.c b/drivers/acpi/acpica/rscalc.c index 52865ee6bc77..b6667ff059e5 100644 --- a/drivers/acpi/acpica/rscalc.c +++ b/drivers/acpi/acpica/rscalc.c @@ -557,9 +557,9 @@ acpi_rs_get_pci_routing_table_length(union acpi_operand_object *package_object, table_index++) { if (*sub_object_list && /* Null object allowed */ ((ACPI_TYPE_STRING == - ACPI_GET_OBJECT_TYPE(*sub_object_list)) || + (*sub_object_list)->common.type) || ((ACPI_TYPE_LOCAL_REFERENCE == - ACPI_GET_OBJECT_TYPE(*sub_object_list)) && + (*sub_object_list)->common.type) && ((*sub_object_list)->reference.class == ACPI_REFCLASS_NAME)))) { name_found = TRUE; @@ -575,8 +575,7 @@ acpi_rs_get_pci_routing_table_length(union acpi_operand_object *package_object, /* Was a String type found? */ if (name_found) { - if (ACPI_GET_OBJECT_TYPE(*sub_object_list) == - ACPI_TYPE_STRING) { + if ((*sub_object_list)->common.type == ACPI_TYPE_STRING) { /* * The length String.Length field does not include the * terminating NULL, add 1 diff --git a/drivers/acpi/acpica/rscreate.c b/drivers/acpi/acpica/rscreate.c index 61566b1a0616..663f692fffcf 100644 --- a/drivers/acpi/acpica/rscreate.c +++ b/drivers/acpi/acpica/rscreate.c @@ -212,7 +212,7 @@ acpi_rs_create_pci_routing_table(union acpi_operand_object *package_object, /* Each element of the top-level package must also be a package */ - if (ACPI_GET_OBJECT_TYPE(*top_object_list) != ACPI_TYPE_PACKAGE) { + if ((*top_object_list)->common.type != ACPI_TYPE_PACKAGE) { ACPI_ERROR((AE_INFO, "(PRT[%X]) Need sub-package, found %s", index, @@ -240,7 +240,7 @@ acpi_rs_create_pci_routing_table(union acpi_operand_object *package_object, /* 1) First subobject: Dereference the PRT.Address */ obj_desc = sub_object_list[0]; - if (ACPI_GET_OBJECT_TYPE(obj_desc) != ACPI_TYPE_INTEGER) { + if (obj_desc->common.type != ACPI_TYPE_INTEGER) { ACPI_ERROR((AE_INFO, "(PRT[%X].Address) Need Integer, found %s", index, @@ -253,7 +253,7 @@ acpi_rs_create_pci_routing_table(union acpi_operand_object *package_object, /* 2) Second subobject: Dereference the PRT.Pin */ obj_desc = sub_object_list[1]; - if (ACPI_GET_OBJECT_TYPE(obj_desc) != ACPI_TYPE_INTEGER) { + if (obj_desc->common.type != ACPI_TYPE_INTEGER) { ACPI_ERROR((AE_INFO, "(PRT[%X].Pin) Need Integer, found %s", index, @@ -265,7 +265,7 @@ acpi_rs_create_pci_routing_table(union acpi_operand_object *package_object, * If BIOS erroneously reversed the _PRT source_name and source_index, * then reverse them back. */ - if (ACPI_GET_OBJECT_TYPE(sub_object_list[3]) != + if ((sub_object_list[3])->common.type != ACPI_TYPE_INTEGER) { if (acpi_gbl_enable_interpreter_slack) { source_name_index = 3; @@ -291,8 +291,7 @@ acpi_rs_create_pci_routing_table(union acpi_operand_object *package_object, * other ACPI implementations. */ obj_desc = sub_object_list[3]; - if (!obj_desc - || (ACPI_GET_OBJECT_TYPE(obj_desc) != ACPI_TYPE_INTEGER)) { + if (!obj_desc || (obj_desc->common.type != ACPI_TYPE_INTEGER)) { sub_object_list[3] = sub_object_list[2]; sub_object_list[2] = obj_desc; @@ -307,7 +306,7 @@ acpi_rs_create_pci_routing_table(union acpi_operand_object *package_object, */ obj_desc = sub_object_list[source_name_index]; if (obj_desc) { - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_LOCAL_REFERENCE: if (obj_desc->reference.class != @@ -380,7 +379,7 @@ acpi_rs_create_pci_routing_table(union acpi_operand_object *package_object, /* 4) Fourth subobject: Dereference the PRT.source_index */ obj_desc = sub_object_list[source_index_index]; - if (ACPI_GET_OBJECT_TYPE(obj_desc) != ACPI_TYPE_INTEGER) { + if (obj_desc->common.type != ACPI_TYPE_INTEGER) { ACPI_ERROR((AE_INFO, "(PRT[%X].SourceIndex) Need Integer, found %s", index, diff --git a/drivers/acpi/acpica/utcopy.c b/drivers/acpi/acpica/utcopy.c index b0dcfd3c872a..cabe860ce007 100644 --- a/drivers/acpi/acpica/utcopy.c +++ b/drivers/acpi/acpica/utcopy.c @@ -135,11 +135,11 @@ acpi_ut_copy_isimple_to_esimple(union acpi_operand_object *internal_object, * In general, the external object will be the same type as * the internal object */ - external_object->type = ACPI_GET_OBJECT_TYPE(internal_object); + external_object->type = internal_object->common.type; /* However, only a limited number of external types are supported */ - switch (ACPI_GET_OBJECT_TYPE(internal_object)) { + switch (internal_object->common.type) { case ACPI_TYPE_STRING: external_object->string.pointer = (char *)data_space; @@ -222,8 +222,8 @@ acpi_ut_copy_isimple_to_esimple(union acpi_operand_object *internal_object, */ ACPI_ERROR((AE_INFO, "Unsupported object type, cannot convert to external object: %s", - acpi_ut_get_type_name(ACPI_GET_OBJECT_TYPE - (internal_object)))); + acpi_ut_get_type_name(internal_object->common. + type))); return_ACPI_STATUS(AE_SUPPORT); } @@ -355,7 +355,7 @@ acpi_ut_copy_ipackage_to_epackage(union acpi_operand_object *internal_object, info.object_space = 0; info.num_packages = 1; - external_object->type = ACPI_GET_OBJECT_TYPE(internal_object); + external_object->type = internal_object->common.type; external_object->package.count = internal_object->package.count; external_object->package.elements = ACPI_CAST_PTR(union acpi_object, info.free_space); @@ -399,7 +399,7 @@ acpi_ut_copy_iobject_to_eobject(union acpi_operand_object *internal_object, ACPI_FUNCTION_TRACE(ut_copy_iobject_to_eobject); - if (ACPI_GET_OBJECT_TYPE(internal_object) == ACPI_TYPE_PACKAGE) { + if (internal_object->common.type == ACPI_TYPE_PACKAGE) { /* * Package object: Copy all subobjects (including * nested packages) @@ -697,7 +697,7 @@ acpi_ut_copy_simple_object(union acpi_operand_object *source_desc, /* Handle the objects with extra data */ - switch (ACPI_GET_OBJECT_TYPE(dest_desc)) { + switch (dest_desc->common.type) { case ACPI_TYPE_BUFFER: /* * Allocate and copy the actual buffer if and only if: @@ -814,8 +814,8 @@ acpi_ut_copy_ielement_to_ielement(u8 object_type, * This is a simple object, just copy it */ target_object = - acpi_ut_create_internal_object(ACPI_GET_OBJECT_TYPE - (source_object)); + acpi_ut_create_internal_object(source_object-> + common.type); if (!target_object) { return (AE_NO_MEMORY); } @@ -892,7 +892,7 @@ acpi_ut_copy_ipackage_to_ipackage(union acpi_operand_object *source_obj, ACPI_FUNCTION_TRACE(ut_copy_ipackage_to_ipackage); - dest_obj->common.type = ACPI_GET_OBJECT_TYPE(source_obj); + dest_obj->common.type = source_obj->common.type; dest_obj->common.flags = source_obj->common.flags; dest_obj->package.count = source_obj->package.count; @@ -950,15 +950,14 @@ acpi_ut_copy_iobject_to_iobject(union acpi_operand_object *source_desc, /* Create the top level object */ - *dest_desc = - acpi_ut_create_internal_object(ACPI_GET_OBJECT_TYPE(source_desc)); + *dest_desc = acpi_ut_create_internal_object(source_desc->common.type); if (!*dest_desc) { return_ACPI_STATUS(AE_NO_MEMORY); } /* Copy the object and possible subobjects */ - if (ACPI_GET_OBJECT_TYPE(source_desc) == ACPI_TYPE_PACKAGE) { + if (source_desc->common.type == ACPI_TYPE_PACKAGE) { status = acpi_ut_copy_ipackage_to_ipackage(source_desc, *dest_desc, walk_state); diff --git a/drivers/acpi/acpica/utdelete.c b/drivers/acpi/acpica/utdelete.c index a0be9e39531e..a5ee23bc4f55 100644 --- a/drivers/acpi/acpica/utdelete.c +++ b/drivers/acpi/acpica/utdelete.c @@ -86,7 +86,7 @@ static void acpi_ut_delete_internal_obj(union acpi_operand_object *object) * Must delete or free any pointers within the object that are not * actual ACPI objects (for example, a raw buffer pointer). */ - switch (ACPI_GET_OBJECT_TYPE(object)) { + switch (object->common.type) { case ACPI_TYPE_STRING: ACPI_DEBUG_PRINT((ACPI_DB_ALLOCATIONS, @@ -382,7 +382,7 @@ acpi_ut_update_ref_count(union acpi_operand_object *object, u32 action) object, new_count)); } - if (ACPI_GET_OBJECT_TYPE(object) == ACPI_TYPE_METHOD) { + if (object->common.type == ACPI_TYPE_METHOD) { ACPI_DEBUG_PRINT((ACPI_DB_ALLOCATIONS, "Method Obj %p Refs=%X, [Decremented]\n", object, new_count)); @@ -469,7 +469,7 @@ acpi_ut_update_object_reference(union acpi_operand_object *object, u16 action) * All sub-objects must have their reference count incremented also. * Different object types have different subobjects. */ - switch (ACPI_GET_OBJECT_TYPE(object)) { + switch (object->common.type) { case ACPI_TYPE_DEVICE: case ACPI_TYPE_PROCESSOR: case ACPI_TYPE_POWER: diff --git a/drivers/acpi/acpica/uteval.c b/drivers/acpi/acpica/uteval.c index 9c9897dbe907..99bfbd23258b 100644 --- a/drivers/acpi/acpica/uteval.c +++ b/drivers/acpi/acpica/uteval.c @@ -248,7 +248,7 @@ acpi_ut_evaluate_object(struct acpi_namespace_node *prefix_node, /* Map the return object type to the bitmapped type */ - switch (ACPI_GET_OBJECT_TYPE(info->return_object)) { + switch ((info->return_object)->common.type) { case ACPI_TYPE_INTEGER: return_btype = ACPI_BTYPE_INTEGER; break; @@ -418,7 +418,7 @@ acpi_ut_execute_HID(struct acpi_namespace_node *device_node, return_ACPI_STATUS(status); } - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_INTEGER) { + if (obj_desc->common.type == ACPI_TYPE_INTEGER) { /* Convert the Numeric HID to string */ @@ -459,7 +459,7 @@ acpi_ut_translate_one_cid(union acpi_operand_object *obj_desc, struct acpi_compatible_id *one_cid) { - switch (ACPI_GET_OBJECT_TYPE(obj_desc)) { + switch (obj_desc->common.type) { case ACPI_TYPE_INTEGER: /* Convert the Numeric CID to string */ @@ -527,7 +527,7 @@ acpi_ut_execute_CID(struct acpi_namespace_node * device_node, /* Get the number of _CIDs returned */ count = 1; - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_PACKAGE) { + if (obj_desc->common.type == ACPI_TYPE_PACKAGE) { count = obj_desc->package.count; } @@ -555,7 +555,7 @@ acpi_ut_execute_CID(struct acpi_namespace_node * device_node, /* The _CID object can be either a single CID or a package (list) of CIDs */ - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_PACKAGE) { + if (obj_desc->common.type == ACPI_TYPE_PACKAGE) { /* Translate each package element */ @@ -620,7 +620,7 @@ acpi_ut_execute_UID(struct acpi_namespace_node *device_node, return_ACPI_STATUS(status); } - if (ACPI_GET_OBJECT_TYPE(obj_desc) == ACPI_TYPE_INTEGER) { + if (obj_desc->common.type == ACPI_TYPE_INTEGER) { /* Convert the Numeric UID to string */ diff --git a/drivers/acpi/acpica/utglobal.c b/drivers/acpi/acpica/utglobal.c index bec0e21673cb..7fc35d33adb1 100644 --- a/drivers/acpi/acpica/utglobal.c +++ b/drivers/acpi/acpica/utglobal.c @@ -473,7 +473,7 @@ char *acpi_ut_get_object_type_name(union acpi_operand_object *obj_desc) return ("[NULL Object Descriptor]"); } - return (acpi_ut_get_type_name(ACPI_GET_OBJECT_TYPE(obj_desc))); + return (acpi_ut_get_type_name(obj_desc->common.type)); } /******************************************************************************* diff --git a/drivers/acpi/acpica/utmisc.c b/drivers/acpi/acpica/utmisc.c index c1f7f4e1a72d..1c9e250caefb 100644 --- a/drivers/acpi/acpica/utmisc.c +++ b/drivers/acpi/acpica/utmisc.c @@ -938,8 +938,7 @@ acpi_ut_walk_package_tree(union acpi_operand_object * source_object, if ((!this_source_obj) || (ACPI_GET_DESCRIPTOR_TYPE(this_source_obj) != ACPI_DESC_TYPE_OPERAND) - || (ACPI_GET_OBJECT_TYPE(this_source_obj) != - ACPI_TYPE_PACKAGE)) { + || (this_source_obj->common.type != ACPI_TYPE_PACKAGE)) { status = walk_callback(ACPI_COPY_TYPE_SIMPLE, this_source_obj, state, context); diff --git a/drivers/acpi/acpica/utobject.c b/drivers/acpi/acpica/utobject.c index fd5ea7543e5b..ae337a717438 100644 --- a/drivers/acpi/acpica/utobject.c +++ b/drivers/acpi/acpica/utobject.c @@ -457,7 +457,7 @@ acpi_ut_get_simple_object_size(union acpi_operand_object *internal_object, * must be accessed bytewise or there may be alignment problems on * certain processors */ - switch (ACPI_GET_OBJECT_TYPE(internal_object)) { + switch (internal_object->common.type) { case ACPI_TYPE_STRING: length += (acpi_size) internal_object->string.length + 1; @@ -518,8 +518,7 @@ acpi_ut_get_simple_object_size(union acpi_operand_object *internal_object, ACPI_ERROR((AE_INFO, "Cannot convert to external object - " "unsupported type [%s] %X in object %p", acpi_ut_get_object_type_name(internal_object), - ACPI_GET_OBJECT_TYPE(internal_object), - internal_object)); + internal_object->common.type, internal_object)); status = AE_TYPE; break; } @@ -664,7 +663,7 @@ acpi_ut_get_object_size(union acpi_operand_object *internal_object, if ((ACPI_GET_DESCRIPTOR_TYPE(internal_object) == ACPI_DESC_TYPE_OPERAND) - && (ACPI_GET_OBJECT_TYPE(internal_object) == ACPI_TYPE_PACKAGE)) { + && (internal_object->common.type == ACPI_TYPE_PACKAGE)) { status = acpi_ut_get_package_object_size(internal_object, obj_length); -- cgit v1.2.3-59-g8ed1b From 4f70e371cdf6ab4f988fbaf2257e6259422ba662 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:52:43 +0800 Subject: ACPICA: Conditionally compile acpi_set_firmware_waking_vector64 This function is only needed on 64-bit host operating systems. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/hwsleep.c | 5 ++++- include/acpi/acpixf.h | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index 51868f48159c..26e249e69ead 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -90,6 +90,7 @@ acpi_set_firmware_waking_vector(u32 physical_address) ACPI_EXPORT_SYMBOL(acpi_set_firmware_waking_vector) +#if ACPI_MACHINE_WIDTH == 64 /******************************************************************************* * * FUNCTION: acpi_set_firmware_waking_vector64 @@ -100,7 +101,8 @@ ACPI_EXPORT_SYMBOL(acpi_set_firmware_waking_vector) * RETURN: Status * * DESCRIPTION: Sets the 64-bit X_firmware_waking_vector field of the FACS, if - * it exists in the table. + * it exists in the table. This function is intended for use with + * 64-bit host operating systems. * ******************************************************************************/ acpi_status @@ -124,6 +126,7 @@ acpi_set_firmware_waking_vector64(u64 physical_address) } ACPI_EXPORT_SYMBOL(acpi_set_firmware_waking_vector64) +#endif /******************************************************************************* * diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 8f9df6235255..e10c89691043 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -351,11 +351,11 @@ acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value); acpi_status acpi_set_register(u32 register_id, u32 value); -acpi_status -acpi_set_firmware_waking_vector(u32 physical_address); +acpi_status acpi_set_firmware_waking_vector(u32 physical_address); -acpi_status -acpi_set_firmware_waking_vector64(u64 physical_address); +#if ACPI_MACHINE_WIDTH == 64 +acpi_status acpi_set_firmware_waking_vector64(u64 physical_address); +#endif acpi_status acpi_read(u32 *value, struct acpi_generic_address *reg); -- cgit v1.2.3-59-g8ed1b From c114e4b6c606c7f174b752f946fcfb0e7e61a347 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Mon, 23 Feb 2009 11:00:00 +0800 Subject: ACPICA: Debug output: print result of _OSI invocations Print input strings and the result (supported or not supported) for invocations of the _OSI method. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/uteval.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/drivers/acpi/acpica/uteval.c b/drivers/acpi/acpica/uteval.c index 99bfbd23258b..9c4ae6f26b9d 100644 --- a/drivers/acpi/acpica/uteval.c +++ b/drivers/acpi/acpica/uteval.c @@ -98,6 +98,7 @@ acpi_status acpi_ut_osi_implementation(struct acpi_walk_state *walk_state) acpi_status status; union acpi_operand_object *string_desc; union acpi_operand_object *return_desc; + u32 return_value; u32 i; ACPI_FUNCTION_TRACE(ut_osi_implementation); @@ -116,10 +117,9 @@ acpi_status acpi_ut_osi_implementation(struct acpi_walk_state *walk_state) return_ACPI_STATUS(AE_NO_MEMORY); } - /* Default return value is 0, NOT-SUPPORTED */ + /* Default return value is 0, NOT SUPPORTED */ - return_desc->integer.value = 0; - walk_state->return_desc = return_desc; + return_value = 0; /* Compare input string to static table of supported interfaces */ @@ -127,8 +127,11 @@ acpi_status acpi_ut_osi_implementation(struct acpi_walk_state *walk_state) if (!ACPI_STRCMP (string_desc->string.pointer, acpi_interfaces_supported[i])) { - return_desc->integer.value = ACPI_UINT32_MAX; - goto done; + + /* The interface is supported */ + + return_value = ACPI_UINT32_MAX; + goto exit; } } @@ -139,15 +142,22 @@ acpi_status acpi_ut_osi_implementation(struct acpi_walk_state *walk_state) */ status = acpi_os_validate_interface(string_desc->string.pointer); if (ACPI_SUCCESS(status)) { - return_desc->integer.value = ACPI_UINT32_MAX; + + /* The interface is supported */ + + return_value = ACPI_UINT32_MAX; } -done: - ACPI_DEBUG_PRINT_RAW((ACPI_DB_INFO, "ACPI: BIOS _OSI(%s) %ssupported\n", - string_desc->string.pointer, - return_desc->integer.value == 0 ? "not-" : "")); +exit: + ACPI_DEBUG_PRINT_RAW ((ACPI_DB_INFO, + "ACPI: BIOS _OSI(%s) is %ssupported\n", + string_desc->string.pointer, return_value == 0 ? "not " : "")); - return_ACPI_STATUS(AE_OK); + /* Complete the return value */ + + return_desc->integer.value = return_value; + walk_state->return_desc = return_desc; + return_ACPI_STATUS (AE_OK); } /******************************************************************************* -- cgit v1.2.3-59-g8ed1b From 7bcc06e845479bde939059bafa088bf25ede9dbf Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 14:58:08 +0800 Subject: ACPICA: Debug output: decrease verbosity of DB_INFO debug level Removed some of the extraneous debug prints using the DB_INFO level. This should make the DB_INFO more useful. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/nseval.c | 2 +- drivers/acpi/acpica/nsutils.c | 2 +- drivers/acpi/acpica/utobject.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/acpica/nseval.c b/drivers/acpi/acpica/nseval.c index 0f3d5f9b5966..8e7dec1176c9 100644 --- a/drivers/acpi/acpica/nseval.c +++ b/drivers/acpi/acpica/nseval.c @@ -155,7 +155,7 @@ acpi_status acpi_ns_evaluate(struct acpi_evaluate_info * info) } - ACPI_DUMP_PATHNAME(info->resolved_node, "Execute Method:", + ACPI_DUMP_PATHNAME(info->resolved_node, "ACPI: Execute Method", ACPI_LV_INFO, _COMPONENT); ACPI_DEBUG_PRINT((ACPI_DB_EXEC, diff --git a/drivers/acpi/acpica/nsutils.c b/drivers/acpi/acpica/nsutils.c index 3e1149bf4aa5..d30b0e65ab3c 100644 --- a/drivers/acpi/acpica/nsutils.c +++ b/drivers/acpi/acpica/nsutils.c @@ -872,7 +872,7 @@ acpi_ns_get_node(struct acpi_namespace_node *prefix_node, (flags | ACPI_NS_DONT_OPEN_SCOPE), NULL, return_node); if (ACPI_FAILURE(status)) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "%s, %s\n", + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "%s, %s\n", pathname, acpi_format_exception(status))); } diff --git a/drivers/acpi/acpica/utobject.c b/drivers/acpi/acpica/utobject.c index ae337a717438..0207b625274a 100644 --- a/drivers/acpi/acpica/utobject.c +++ b/drivers/acpi/acpica/utobject.c @@ -310,7 +310,7 @@ u8 acpi_ut_valid_internal_object(void *object) /* Check for a null pointer */ if (!object) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "**** Null Object Ptr\n")); + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "**** Null Object Ptr\n")); return (FALSE); } @@ -324,7 +324,7 @@ u8 acpi_ut_valid_internal_object(void *object) return (TRUE); default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "%p is not not an ACPI operand obj [%s]\n", object, acpi_ut_get_descriptor_name(object))); break; -- cgit v1.2.3-59-g8ed1b From ec41f193eadb6301f3c052b5e0dbc0b5636982e8 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 15:03:30 +0800 Subject: ACPICA: Formatting update - no functional changes Split some long lines. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/tbfadt.c | 33 ++++++++++++++++++++------------- drivers/acpi/acpica/tbinstal.c | 9 +++++---- drivers/acpi/acpica/tbutils.c | 17 +++++++++-------- drivers/acpi/acpica/tbxface.c | 37 ++++++++++++++++--------------------- drivers/acpi/acpica/tbxfroot.c | 4 ++-- drivers/acpi/acpica/utcopy.c | 5 +++-- drivers/acpi/acpica/uteval.c | 5 ++--- drivers/acpi/acpica/utglobal.c | 5 ++++- 8 files changed, 61 insertions(+), 54 deletions(-) diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index 43fe886b41a2..af8fbe12d8b7 100644 --- a/drivers/acpi/acpica/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -132,7 +132,8 @@ static struct acpi_fadt_info fadt_info_table[] = { ACPI_FADT_SEPARATE_LENGTH} }; -#define ACPI_FADT_INFO_ENTRIES (sizeof (fadt_info_table) / sizeof (struct acpi_fadt_info)) +#define ACPI_FADT_INFO_ENTRIES \ + (sizeof (fadt_info_table) / sizeof (struct acpi_fadt_info)) /* Table used to split Event Blocks into separate status/enable registers */ @@ -161,7 +162,8 @@ static struct acpi_fadt_pm_info fadt_pm_info_table[] = { 1} }; -#define ACPI_FADT_PM_INFO_ENTRIES (sizeof (fadt_pm_info_table) / sizeof (struct acpi_fadt_pm_info)) +#define ACPI_FADT_PM_INFO_ENTRIES \ + (sizeof (fadt_pm_info_table) / sizeof (struct acpi_fadt_pm_info)) /******************************************************************************* * @@ -416,7 +418,7 @@ static void acpi_tb_convert_fadt(void) } } -/****************************************************************************** +/******************************************************************************* * * FUNCTION: acpi_tb_validate_fadt * @@ -503,7 +505,8 @@ static void acpi_tb_validate_fadt(void) */ if (!address64->address || !length) { ACPI_ERROR((AE_INFO, - "Required field %s has zero address and/or length: %8.8X%8.8X/%X", + "Required field %s has zero address and/or length:" + " %8.8X%8.8X/%X", name, ACPI_FORMAT_UINT64(address64-> address), @@ -512,12 +515,14 @@ static void acpi_tb_validate_fadt(void) } else if (fadt_info_table[i].type & ACPI_FADT_SEPARATE_LENGTH) { /* * Field is optional (PM2Control, GPE0, GPE1) AND has its own - * length field. If present, both the address and length must be valid. + * length field. If present, both the address and length must + * be valid. */ - if ((address64->address && !length) - || (!address64->address && length)) { + if ((address64->address && !length) || + (!address64->address && length)) { ACPI_WARNING((AE_INFO, - "Optional field %s has zero address or length: %8.8X%8.8X/%X", + "Optional field %s has zero address or length: " + "%8.8X%8.8X/%X", name, ACPI_FORMAT_UINT64(address64-> address), @@ -525,8 +530,10 @@ static void acpi_tb_validate_fadt(void) } } - /* If both 32- and 64-bit addresses are valid (non-zero), they must match */ - + /* + * If both 32- and 64-bit addresses are valid (non-zero), + * they must match + */ if (address64->address && *address32 && (address64->address != (u64) * address32)) { ACPI_ERROR((AE_INFO, @@ -537,7 +544,7 @@ static void acpi_tb_validate_fadt(void) } } -/****************************************************************************** +/******************************************************************************* * * FUNCTION: acpi_tb_setup_fadt_registers * @@ -596,8 +603,8 @@ static void acpi_tb_setup_fadt_registers(void) * Each register is defined to be (event block length / 2). Extra divide * by 8 converts bits to bytes. */ - pm1_register_byte_width = - (u8)ACPI_DIV_16(acpi_gbl_FADT.xpm1a_event_block.bit_width); + pm1_register_byte_width = (u8) + ACPI_DIV_16(acpi_gbl_FADT.xpm1a_event_block.bit_width); /* * Calculate separate GAS structs for the PM1x (A/B) Status and Enable diff --git a/drivers/acpi/acpica/tbinstal.c b/drivers/acpi/acpica/tbinstal.c index ef269a297b57..c37993003f2c 100644 --- a/drivers/acpi/acpica/tbinstal.c +++ b/drivers/acpi/acpica/tbinstal.c @@ -273,8 +273,9 @@ acpi_status acpi_tb_resize_root_table_list(void) /* Increase the Table Array size */ tables = ACPI_ALLOCATE_ZEROED(((acpi_size) acpi_gbl_root_table_list. - size + ACPI_ROOT_TABLE_SIZE_INCREMENT) - * sizeof(struct acpi_table_desc)); + size + + ACPI_ROOT_TABLE_SIZE_INCREMENT) * + sizeof(struct acpi_table_desc)); if (!tables) { ACPI_ERROR((AE_INFO, "Could not allocate new root table array")); @@ -561,8 +562,8 @@ u8 acpi_tb_is_table_loaded(u32 table_index) (void)acpi_ut_acquire_mutex(ACPI_MTX_TABLES); if (table_index < acpi_gbl_root_table_list.count) { is_loaded = (u8) - (acpi_gbl_root_table_list.tables[table_index]. - flags & ACPI_TABLE_IS_LOADED); + (acpi_gbl_root_table_list.tables[table_index].flags & + ACPI_TABLE_IS_LOADED); } (void)acpi_ut_release_mutex(ACPI_MTX_TABLES); diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c index a0b424356b98..dc45b49e5cb9 100644 --- a/drivers/acpi/acpica/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -413,7 +413,8 @@ acpi_tb_get_root_table_entry(u8 *table_entry, u32 table_entry_size) } else { /* * 32-bit platform, XSDT: Truncate 64-bit to 32-bit and return - * 64-bit platform, XSDT: Move (unaligned) 64-bit to local, return 64-bit + * 64-bit platform, XSDT: Move (unaligned) 64-bit to local, + * return 64-bit */ ACPI_MOVE_64_TO_64(&address64, table_entry); @@ -423,7 +424,8 @@ acpi_tb_get_root_table_entry(u8 *table_entry, u32 table_entry_size) /* Will truncate 64-bit address to 32 bits, issue warning */ ACPI_WARNING((AE_INFO, - "64-bit Physical Address in XSDT is too large (%8.8X%8.8X), truncating", + "64-bit Physical Address in XSDT is too large (%8.8X%8.8X)," + " truncating", ACPI_FORMAT_UINT64(address64))); } #endif @@ -546,13 +548,12 @@ acpi_tb_parse_root_table(acpi_physical_address rsdp_address) /* Calculate the number of tables described in the root table */ - table_count = - (u32) ((table->length - - sizeof(struct acpi_table_header)) / table_entry_size); - + table_count = (u32)((table->length - sizeof(struct acpi_table_header)) / + table_entry_size); /* - * First two entries in the table array are reserved for the DSDT and FACS, - * which are not actually present in the RSDT/XSDT - they come from the FADT + * First two entries in the table array are reserved for the DSDT + * and FACS, which are not actually present in the RSDT/XSDT - they + * come from the FADT */ table_entry = ACPI_CAST_PTR(u8, table) + sizeof(struct acpi_table_header); diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c index 416d01d9a970..dbca22651504 100644 --- a/drivers/acpi/acpica/tbxface.c +++ b/drivers/acpi/acpica/tbxface.c @@ -246,7 +246,7 @@ acpi_status acpi_load_table(struct acpi_table_header *table_ptr) ACPI_EXPORT_SYMBOL(acpi_load_table) -/****************************************************************************** +/******************************************************************************* * * FUNCTION: acpi_get_table_header * @@ -261,7 +261,7 @@ ACPI_EXPORT_SYMBOL(acpi_load_table) * NOTE: Caller is responsible in unmapping the header with * acpi_os_unmap_memory * - *****************************************************************************/ + ******************************************************************************/ acpi_status acpi_get_table_header(char *signature, u32 instance, struct acpi_table_header *out_table_header) @@ -276,9 +276,8 @@ acpi_get_table_header(char *signature, return (AE_BAD_PARAMETER); } - /* - * Walk the root table list - */ + /* Walk the root table list */ + for (i = 0, j = 0; i < acpi_gbl_root_table_list.count; i++) { if (!ACPI_COMPARE_NAME (&(acpi_gbl_root_table_list.tables[i].signature), @@ -291,8 +290,8 @@ acpi_get_table_header(char *signature, } if (!acpi_gbl_root_table_list.tables[i].pointer) { - if ((acpi_gbl_root_table_list.tables[i]. - flags & ACPI_TABLE_ORIGIN_MASK) == + if ((acpi_gbl_root_table_list.tables[i].flags & + ACPI_TABLE_ORIGIN_MASK) == ACPI_TABLE_ORIGIN_MAPPED) { header = acpi_os_map_memory(acpi_gbl_root_table_list. @@ -323,7 +322,7 @@ acpi_get_table_header(char *signature, ACPI_EXPORT_SYMBOL(acpi_get_table_header) -/****************************************************************************** +/******************************************************************************* * * FUNCTION: acpi_unload_table_id * @@ -374,7 +373,7 @@ ACPI_EXPORT_SYMBOL(acpi_unload_table_id) * * DESCRIPTION: Finds and verifies an ACPI table. * - *****************************************************************************/ + ******************************************************************************/ acpi_status acpi_get_table(char *signature, u32 instance, struct acpi_table_header **out_table) @@ -389,9 +388,8 @@ acpi_get_table(char *signature, return (AE_BAD_PARAMETER); } - /* - * Walk the root table list - */ + /* Walk the root table list */ + for (i = 0, j = 0; i < acpi_gbl_root_table_list.count; i++) { if (!ACPI_COMPARE_NAME (&(acpi_gbl_root_table_list.tables[i].signature), @@ -526,17 +524,15 @@ static acpi_status acpi_tb_load_namespace(void) (void)acpi_ut_release_mutex(ACPI_MTX_TABLES); - /* - * Load and parse tables. - */ + /* Load and parse tables */ + status = acpi_ns_load_table(ACPI_TABLE_INDEX_DSDT, acpi_gbl_root_node); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } - /* - * Load any SSDT or PSDT tables. Note: Loop leaves tables locked - */ + /* Load any SSDT or PSDT tables. Note: Loop leaves tables locked */ + (void)acpi_ut_acquire_mutex(ACPI_MTX_TABLES); for (i = 0; i < acpi_gbl_root_table_list.count; ++i) { if ((!ACPI_COMPARE_NAME @@ -589,9 +585,8 @@ acpi_status acpi_load_tables(void) ACPI_FUNCTION_TRACE(acpi_load_tables); - /* - * Load the namespace from the tables - */ + /* Load the namespace from the tables */ + status = acpi_tb_load_namespace(); if (ACPI_FAILURE(status)) { ACPI_EXCEPTION((AE_INFO, status, diff --git a/drivers/acpi/acpica/tbxfroot.c b/drivers/acpi/acpica/tbxfroot.c index b7fc8dd43341..85ea834199e2 100644 --- a/drivers/acpi/acpica/tbxfroot.c +++ b/drivers/acpi/acpica/tbxfroot.c @@ -75,8 +75,8 @@ static acpi_status acpi_tb_validate_rsdp(struct acpi_table_rsdp *rsdp) * Note: Sometimes there exists more than one RSDP in memory; the valid * RSDP has a valid checksum, all others have an invalid checksum. */ - if (ACPI_STRNCMP((char *)rsdp, ACPI_SIG_RSDP, sizeof(ACPI_SIG_RSDP) - 1) - != 0) { + if (ACPI_STRNCMP((char *)rsdp, ACPI_SIG_RSDP, + sizeof(ACPI_SIG_RSDP) - 1) != 0) { /* Nope, BAD Signature */ diff --git a/drivers/acpi/acpica/utcopy.c b/drivers/acpi/acpica/utcopy.c index cabe860ce007..919624f123d5 100644 --- a/drivers/acpi/acpica/utcopy.c +++ b/drivers/acpi/acpica/utcopy.c @@ -496,8 +496,9 @@ acpi_ut_copy_esimple_to_isimple(union acpi_object *external_object, case ACPI_TYPE_STRING: internal_object->string.pointer = - ACPI_ALLOCATE_ZEROED((acpi_size) external_object->string. - length + 1); + ACPI_ALLOCATE_ZEROED((acpi_size) + external_object->string.length + 1); + if (!internal_object->string.pointer) { goto error_exit; } diff --git a/drivers/acpi/acpica/uteval.c b/drivers/acpi/acpica/uteval.c index 9c4ae6f26b9d..3b9152579d04 100644 --- a/drivers/acpi/acpica/uteval.c +++ b/drivers/acpi/acpica/uteval.c @@ -124,9 +124,8 @@ acpi_status acpi_ut_osi_implementation(struct acpi_walk_state *walk_state) /* Compare input string to static table of supported interfaces */ for (i = 0; i < ACPI_ARRAY_LENGTH(acpi_interfaces_supported); i++) { - if (!ACPI_STRCMP - (string_desc->string.pointer, - acpi_interfaces_supported[i])) { + if (!ACPI_STRCMP(string_desc->string.pointer, + acpi_interfaces_supported[i])) { /* The interface is supported */ diff --git a/drivers/acpi/acpica/utglobal.c b/drivers/acpi/acpica/utglobal.c index 7fc35d33adb1..256ce7778565 100644 --- a/drivers/acpi/acpica/utglobal.c +++ b/drivers/acpi/acpica/utglobal.c @@ -746,7 +746,10 @@ acpi_status acpi_ut_init_globals(void) for (i = 0; i < ACPI_NUM_OWNERID_MASKS; i++) { acpi_gbl_owner_id_mask[i] = 0; } - acpi_gbl_owner_id_mask[ACPI_NUM_OWNERID_MASKS - 1] = 0x80000000; /* Last ID is never valid */ + + /* Last owner_iD is never valid */ + + acpi_gbl_owner_id_mask[ACPI_NUM_OWNERID_MASKS - 1] = 0x80000000; /* GPE support */ -- cgit v1.2.3-59-g8ed1b From 9892dd23cbbfab1f7d4818622296e415979a9c77 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 15:10:07 +0800 Subject: ACPICA: Optimize ACPI register locking Removed locking for reads from the ACPI bit registers in PM1 Status, Enable, Control, and PM2 Control. The lock is not required when reading the single-bit registers. The acpi_get_register_unlocked function is no longer needed and has been removed. This will improve performance for reads on these registers. ACPICA BZ 760. http://www.acpica.org/bugzilla/show_bug.cgi?id=760 Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/hwsleep.c | 2 +- drivers/acpi/acpica/hwxface.c | 65 ++++++++++++++++++------------------------- drivers/acpi/processor_idle.c | 2 +- include/acpi/acpixf.h | 2 -- 4 files changed, 29 insertions(+), 42 deletions(-) diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index 26e249e69ead..677ccb6bceea 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -365,7 +365,7 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) /* Wait until we enter sleep state */ do { - status = acpi_get_register_unlocked(ACPI_BITREG_WAKE_STATUS, + status = acpi_get_register(ACPI_BITREG_WAKE_STATUS, &in_value); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c index 4df9eacb7c88..c8100199634b 100644 --- a/drivers/acpi/acpica/hwxface.c +++ b/drivers/acpi/acpica/hwxface.c @@ -242,24 +242,35 @@ ACPI_EXPORT_SYMBOL(acpi_write) /******************************************************************************* * - * FUNCTION: acpi_get_register_unlocked + * FUNCTION: acpi_get_register * - * PARAMETERS: register_id - ID of ACPI bit_register to access - * return_value - Value that was read from the register + * PARAMETERS: register_id - ID of ACPI Bit Register to access + * return_value - Value that was read from the register, + * normalized to bit position zero. * - * RETURN: Status and the value read from specified Register. Value + * RETURN: Status and the value read from the specified Register. Value * returned is normalized to bit0 (is shifted all the way right) * * DESCRIPTION: ACPI bit_register read function. Does not acquire the HW lock. * + * SUPPORTS: Bit fields in PM1 Status, PM1 Enable, PM1 Control, and + * PM2 Control. + * + * Note: The hardware lock is not required when reading the ACPI bit registers + * since almost all of them are single bit and it does not matter that + * the parent hardware register can be split across two physical + * registers. The only multi-bit field is SLP_TYP in the PM1 control + * register, but this field does not cross an 8-bit boundary (nor does + * it make much sense to actually read this field.) + * ******************************************************************************/ -acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value) +acpi_status acpi_get_register(u32 register_id, u32 *return_value) { u32 register_value = 0; struct acpi_bit_register_info *bit_reg_info; acpi_status status; - ACPI_FUNCTION_TRACE(acpi_get_register_unlocked); + ACPI_FUNCTION_TRACE(acpi_get_register); /* Get the info structure corresponding to the requested ACPI Register */ @@ -268,7 +279,7 @@ acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value) return_ACPI_STATUS(AE_BAD_PARAMETER); } - /* Read from the register */ + /* Read the entire parent register */ status = acpi_hw_register_read(bit_reg_info->parent_register, ®ister_value); @@ -291,46 +302,24 @@ acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value) return_ACPI_STATUS(status); } -ACPI_EXPORT_SYMBOL(acpi_get_register_unlocked) - -/******************************************************************************* - * - * FUNCTION: acpi_get_register - * - * PARAMETERS: register_id - ID of ACPI bit_register to access - * return_value - Value that was read from the register - * - * RETURN: Status and the value read from specified Register. Value - * returned is normalized to bit0 (is shifted all the way right) - * - * DESCRIPTION: ACPI bit_register read function. - * - ******************************************************************************/ -acpi_status acpi_get_register(u32 register_id, u32 *return_value) -{ - acpi_status status; - acpi_cpu_flags flags; - - flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); - status = acpi_get_register_unlocked(register_id, return_value); - acpi_os_release_lock(acpi_gbl_hardware_lock, flags); - - return (status); -} - ACPI_EXPORT_SYMBOL(acpi_get_register) /******************************************************************************* * * FUNCTION: acpi_set_register * - * PARAMETERS: register_id - ID of ACPI bit_register to access - * Value - (only used on write) value to write to the - * Register, NOT pre-normalized to the bit pos + * PARAMETERS: register_id - ID of ACPI Bit Register to access + * Value - Value to write to the register, in bit + * position zero. The bit is automaticallly + * shifted to the correct position. * * RETURN: Status * - * DESCRIPTION: ACPI Bit Register write function. + * DESCRIPTION: ACPI Bit Register write function. Acquires the hardware lock + * since most operations require a read/modify/write sequence. + * + * SUPPORTS: Bit fields in PM1 Status, PM1 Enable, PM1 Control, and + * PM2 Control. * ******************************************************************************/ acpi_status acpi_set_register(u32 register_id, u32 value) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 7bc22a471fe3..6946047d0096 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -800,7 +800,7 @@ static int acpi_idle_bm_check(void) { u32 bm_status = 0; - acpi_get_register_unlocked(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); + acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); if (bm_status) acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); /* diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index e10c89691043..325d4b073aca 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -347,8 +347,6 @@ acpi_status acpi_reset(void); acpi_status acpi_get_register(u32 register_id, u32 * return_value); -acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value); - acpi_status acpi_set_register(u32 register_id, u32 value); acpi_status acpi_set_firmware_waking_vector(u32 physical_address); -- cgit v1.2.3-59-g8ed1b From 50ffba1bd3120b069617455545bc27bcf3cf7579 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Mon, 23 Feb 2009 15:02:07 +0800 Subject: ACPICA: Rename ACPI bit register access functions Rename acpi_get_register and acpi_set_register to clarify the purpose of these functions. New names are acpi_read_bit_register and acpi_write_bit_register. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/evevent.c | 12 ++++++------ drivers/acpi/acpica/evmisc.c | 4 ++-- drivers/acpi/acpica/evxfevnt.c | 24 ++++++++++++------------ drivers/acpi/acpica/hwacpi.c | 2 +- drivers/acpi/acpica/hwsleep.c | 15 ++++++++------- drivers/acpi/acpica/hwxface.c | 16 ++++++++-------- drivers/acpi/processor_idle.c | 10 +++++----- drivers/acpi/sleep.c | 2 +- include/acpi/acpixf.h | 4 ++-- 9 files changed, 45 insertions(+), 44 deletions(-) diff --git a/drivers/acpi/acpica/evevent.c b/drivers/acpi/acpica/evevent.c index 803edd9e3f6a..246f8775ec13 100644 --- a/drivers/acpi/acpica/evevent.c +++ b/drivers/acpi/acpica/evevent.c @@ -204,8 +204,8 @@ static acpi_status acpi_ev_fixed_event_initialize(void) if (acpi_gbl_fixed_event_info[i].enable_register_id != 0xFF) { status = - acpi_set_register(acpi_gbl_fixed_event_info[i]. - enable_register_id, 0); + acpi_write_bit_register(acpi_gbl_fixed_event_info + [i].enable_register_id, 0); if (ACPI_FAILURE(status)) { return (status); } @@ -288,16 +288,16 @@ static u32 acpi_ev_fixed_event_dispatch(u32 event) /* Clear the status bit */ - (void)acpi_set_register(acpi_gbl_fixed_event_info[event]. - status_register_id, 1); + (void)acpi_write_bit_register(acpi_gbl_fixed_event_info[event]. + status_register_id, 1); /* * Make sure we've got a handler. If not, report an error. The event is * disabled to prevent further interrupts. */ if (NULL == acpi_gbl_fixed_event_handlers[event].handler) { - (void)acpi_set_register(acpi_gbl_fixed_event_info[event]. - enable_register_id, 0); + (void)acpi_write_bit_register(acpi_gbl_fixed_event_info[event]. + enable_register_id, 0); ACPI_ERROR((AE_INFO, "No installed handler for fixed event [%08X]", diff --git a/drivers/acpi/acpica/evmisc.c b/drivers/acpi/acpica/evmisc.c index 5f893057bcc6..0e9e12b2f2bb 100644 --- a/drivers/acpi/acpica/evmisc.c +++ b/drivers/acpi/acpica/evmisc.c @@ -534,8 +534,8 @@ acpi_status acpi_ev_release_global_lock(void) */ if (pending) { status = - acpi_set_register(ACPI_BITREG_GLOBAL_LOCK_RELEASE, - 1); + acpi_write_bit_register + (ACPI_BITREG_GLOBAL_LOCK_RELEASE, 1); } ACPI_DEBUG_PRINT((ACPI_DB_EXEC, diff --git a/drivers/acpi/acpica/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c index 35485e4b60a6..484c9793ca06 100644 --- a/drivers/acpi/acpica/evxfevnt.c +++ b/drivers/acpi/acpica/evxfevnt.c @@ -172,8 +172,8 @@ acpi_status acpi_enable_event(u32 event, u32 flags) * register bit) */ status = - acpi_set_register(acpi_gbl_fixed_event_info[event]. - enable_register_id, 1); + acpi_write_bit_register(acpi_gbl_fixed_event_info[event]. + enable_register_id, 1); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } @@ -181,8 +181,8 @@ acpi_status acpi_enable_event(u32 event, u32 flags) /* Make sure that the hardware responded */ status = - acpi_get_register(acpi_gbl_fixed_event_info[event]. - enable_register_id, &value); + acpi_read_bit_register(acpi_gbl_fixed_event_info[event]. + enable_register_id, &value); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } @@ -354,15 +354,15 @@ acpi_status acpi_disable_event(u32 event, u32 flags) * register bit) */ status = - acpi_set_register(acpi_gbl_fixed_event_info[event]. - enable_register_id, 0); + acpi_write_bit_register(acpi_gbl_fixed_event_info[event]. + enable_register_id, 0); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } status = - acpi_get_register(acpi_gbl_fixed_event_info[event]. - enable_register_id, &value); + acpi_read_bit_register(acpi_gbl_fixed_event_info[event]. + enable_register_id, &value); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } @@ -407,8 +407,8 @@ acpi_status acpi_clear_event(u32 event) * register bit) */ status = - acpi_set_register(acpi_gbl_fixed_event_info[event]. - status_register_id, 1); + acpi_write_bit_register(acpi_gbl_fixed_event_info[event]. + status_register_id, 1); return_ACPI_STATUS(status); } @@ -495,7 +495,7 @@ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) /* Get the status of the requested fixed event */ status = - acpi_get_register(acpi_gbl_fixed_event_info[event]. + acpi_read_bit_register(acpi_gbl_fixed_event_info[event]. enable_register_id, &value); if (ACPI_FAILURE(status)) return_ACPI_STATUS(status); @@ -503,7 +503,7 @@ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) *event_status = value; status = - acpi_get_register(acpi_gbl_fixed_event_info[event]. + acpi_read_bit_register(acpi_gbl_fixed_event_info[event]. status_register_id, &value); if (ACPI_FAILURE(status)) return_ACPI_STATUS(status); diff --git a/drivers/acpi/acpica/hwacpi.c b/drivers/acpi/acpica/hwacpi.c index a9d4fea4167f..8902db8c670f 100644 --- a/drivers/acpi/acpica/hwacpi.c +++ b/drivers/acpi/acpica/hwacpi.c @@ -172,7 +172,7 @@ u32 acpi_hw_get_mode(void) return_UINT32(ACPI_SYS_MODE_ACPI); } - status = acpi_get_register(ACPI_BITREG_SCI_ENABLE, &value); + status = acpi_read_bit_register(ACPI_BITREG_SCI_ENABLE, &value); if (ACPI_FAILURE(status)) { return_UINT32(ACPI_SYS_MODE_LEGACY); } diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index 677ccb6bceea..78d62b8a5c31 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -250,7 +250,7 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) /* Clear wake status */ - status = acpi_set_register(ACPI_BITREG_WAKE_STATUS, 1); + status = acpi_write_bit_register(ACPI_BITREG_WAKE_STATUS, 1); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } @@ -365,7 +365,7 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) /* Wait until we enter sleep state */ do { - status = acpi_get_register(ACPI_BITREG_WAKE_STATUS, + status = acpi_read_bit_register(ACPI_BITREG_WAKE_STATUS, &in_value); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); @@ -399,7 +399,7 @@ acpi_status asmlinkage acpi_enter_sleep_state_s4bios(void) ACPI_FUNCTION_TRACE(acpi_enter_sleep_state_s4bios); - status = acpi_set_register(ACPI_BITREG_WAKE_STATUS, 1); + status = acpi_write_bit_register(ACPI_BITREG_WAKE_STATUS, 1); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } @@ -431,7 +431,8 @@ acpi_status asmlinkage acpi_enter_sleep_state_s4bios(void) do { acpi_os_stall(1000); - status = acpi_get_register(ACPI_BITREG_WAKE_STATUS, &in_value); + status = + acpi_read_bit_register(ACPI_BITREG_WAKE_STATUS, &in_value); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } @@ -592,18 +593,18 @@ acpi_status acpi_leave_sleep_state(u8 sleep_state) * it to determine whether the system is rebooting or resuming. Clear * it for compatibility. */ - acpi_set_register(ACPI_BITREG_WAKE_STATUS, 1); + acpi_write_bit_register(ACPI_BITREG_WAKE_STATUS, 1); acpi_gbl_system_awake_and_running = TRUE; /* Enable power button */ (void) - acpi_set_register(acpi_gbl_fixed_event_info + acpi_write_bit_register(acpi_gbl_fixed_event_info [ACPI_EVENT_POWER_BUTTON].enable_register_id, 1); (void) - acpi_set_register(acpi_gbl_fixed_event_info + acpi_write_bit_register(acpi_gbl_fixed_event_info [ACPI_EVENT_POWER_BUTTON].status_register_id, 1); arg.integer.value = ACPI_SST_WORKING; diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c index c8100199634b..529251c7f911 100644 --- a/drivers/acpi/acpica/hwxface.c +++ b/drivers/acpi/acpica/hwxface.c @@ -242,7 +242,7 @@ ACPI_EXPORT_SYMBOL(acpi_write) /******************************************************************************* * - * FUNCTION: acpi_get_register + * FUNCTION: acpi_read_bit_register * * PARAMETERS: register_id - ID of ACPI Bit Register to access * return_value - Value that was read from the register, @@ -264,13 +264,13 @@ ACPI_EXPORT_SYMBOL(acpi_write) * it make much sense to actually read this field.) * ******************************************************************************/ -acpi_status acpi_get_register(u32 register_id, u32 *return_value) +acpi_status acpi_read_bit_register(u32 register_id, u32 *return_value) { u32 register_value = 0; struct acpi_bit_register_info *bit_reg_info; acpi_status status; - ACPI_FUNCTION_TRACE(acpi_get_register); + ACPI_FUNCTION_TRACE(acpi_read_bit_register); /* Get the info structure corresponding to the requested ACPI Register */ @@ -302,11 +302,11 @@ acpi_status acpi_get_register(u32 register_id, u32 *return_value) return_ACPI_STATUS(status); } -ACPI_EXPORT_SYMBOL(acpi_get_register) +ACPI_EXPORT_SYMBOL(acpi_read_bit_register) /******************************************************************************* * - * FUNCTION: acpi_set_register + * FUNCTION: acpi_write_bit_register * * PARAMETERS: register_id - ID of ACPI Bit Register to access * Value - Value to write to the register, in bit @@ -322,14 +322,14 @@ ACPI_EXPORT_SYMBOL(acpi_get_register) * PM2 Control. * ******************************************************************************/ -acpi_status acpi_set_register(u32 register_id, u32 value) +acpi_status acpi_write_bit_register(u32 register_id, u32 value) { u32 register_value = 0; struct acpi_bit_register_info *bit_reg_info; acpi_status status; acpi_cpu_flags lock_flags; - ACPI_FUNCTION_TRACE_U32(acpi_set_register, register_id); + ACPI_FUNCTION_TRACE_U32(acpi_write_bit_register, register_id); /* Get the info structure corresponding to the requested ACPI Register */ @@ -459,7 +459,7 @@ acpi_status acpi_set_register(u32 register_id, u32 value) return_ACPI_STATUS(status); } -ACPI_EXPORT_SYMBOL(acpi_set_register) +ACPI_EXPORT_SYMBOL(acpi_write_bit_register) /******************************************************************************* * diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 6946047d0096..5c69e85c1e4d 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -630,7 +630,7 @@ static void acpi_processor_power_verify_c3(struct acpi_processor *pr, * In either case, the proper way to * handle BM_RLD is to set it and leave it set. */ - acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1); + acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 1); return; } @@ -800,9 +800,9 @@ static int acpi_idle_bm_check(void) { u32 bm_status = 0; - acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); + acpi_read_bit_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); if (bm_status) - acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); + acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); /* * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect * the true state of bus mastering activity; forcing us to @@ -1028,7 +1028,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, c3_cpu_count++; /* Disable bus master arbitration when all CPUs are in C3 */ if (c3_cpu_count == num_online_cpus()) - acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1); + acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); spin_unlock(&c3_lock); } else if (!pr->flags.bm_check) { ACPI_FLUSH_CPU_CACHE(); @@ -1041,7 +1041,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev, /* Re-enable bus master arbitration */ if (pr->flags.bm_check && pr->flags.bm_control) { spin_lock(&c3_lock); - acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0); + acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); c3_cpu_count--; spin_unlock(&c3_lock); } diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 00456fccfa38..837ac7d5801b 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -248,7 +248,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) /* If ACPI is not enabled by the BIOS, we need to enable it here. */ if (set_sci_en_on_resume) - acpi_set_register(ACPI_BITREG_SCI_ENABLE, 1); + acpi_write_bit_register(ACPI_BITREG_SCI_ENABLE, 1); else acpi_enable(); diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 325d4b073aca..a5525a5518a9 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -345,9 +345,9 @@ acpi_resource_to_address64(struct acpi_resource *resource, */ acpi_status acpi_reset(void); -acpi_status acpi_get_register(u32 register_id, u32 * return_value); +acpi_status acpi_read_bit_register(u32 register_id, u32 *return_value); -acpi_status acpi_set_register(u32 register_id, u32 value); +acpi_status acpi_write_bit_register(u32 register_id, u32 value); acpi_status acpi_set_firmware_waking_vector(u32 physical_address); -- cgit v1.2.3-59-g8ed1b From 88dcb04a813265e1a5a1bc74af2db7efa7d62ee6 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Wed, 18 Feb 2009 16:01:04 +0800 Subject: ACPICA: Restructure bit register access functions Update code for acpi_read_bit_register and acpi_write_bit_register. Simplified code path, condensed duplicate code. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/hwxface.c | 177 +++++++++++++++--------------------------- 1 file changed, 62 insertions(+), 115 deletions(-) diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c index 529251c7f911..caad51680bd6 100644 --- a/drivers/acpi/acpica/hwxface.c +++ b/drivers/acpi/acpica/hwxface.c @@ -133,8 +133,8 @@ acpi_status acpi_read(u32 *value, struct acpi_generic_address *reg) *value = 0; /* - * Two address spaces supported: Memory or IO. - * PCI_Config is not supported here because the GAS struct is insufficient + * Two address spaces supported: Memory or IO. PCI_Config is + * not supported here because the GAS structure is insufficient */ switch (reg->space_id) { case ACPI_ADR_SPACE_SYSTEM_MEMORY: @@ -266,11 +266,12 @@ ACPI_EXPORT_SYMBOL(acpi_write) ******************************************************************************/ acpi_status acpi_read_bit_register(u32 register_id, u32 *return_value) { - u32 register_value = 0; struct acpi_bit_register_info *bit_reg_info; + u32 register_value; + u32 value; acpi_status status; - ACPI_FUNCTION_TRACE(acpi_read_bit_register); + ACPI_FUNCTION_TRACE_U32(acpi_read_bit_register, register_id); /* Get the info structure corresponding to the requested ACPI Register */ @@ -283,23 +284,22 @@ acpi_status acpi_read_bit_register(u32 register_id, u32 *return_value) status = acpi_hw_register_read(bit_reg_info->parent_register, ®ister_value); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } - if (ACPI_SUCCESS(status)) { - - /* Normalize the value that was read */ - - register_value = - ((register_value & bit_reg_info->access_bit_mask) - >> bit_reg_info->bit_position); + /* Normalize the value that was read, mask off other bits */ - *return_value = register_value; + value = ((register_value & bit_reg_info->access_bit_mask) + >> bit_reg_info->bit_position); - ACPI_DEBUG_PRINT((ACPI_DB_IO, "Read value %8.8X register %X\n", - register_value, - bit_reg_info->parent_register)); - } + ACPI_DEBUG_PRINT((ACPI_DB_IO, + "BitReg %X, ParentReg %X, Actual %8.8X, ReturnValue %8.8X\n", + register_id, bit_reg_info->parent_register, + register_value, value)); - return_ACPI_STATUS(status); + *return_value = value; + return_ACPI_STATUS(AE_OK); } ACPI_EXPORT_SYMBOL(acpi_read_bit_register) @@ -321,13 +321,16 @@ ACPI_EXPORT_SYMBOL(acpi_read_bit_register) * SUPPORTS: Bit fields in PM1 Status, PM1 Enable, PM1 Control, and * PM2 Control. * + * Note that at this level, the fact that there may be actually two + * hardware registers (A and B - and B may not exist) is abstracted. + * ******************************************************************************/ acpi_status acpi_write_bit_register(u32 register_id, u32 value) { - u32 register_value = 0; struct acpi_bit_register_info *bit_reg_info; - acpi_status status; acpi_cpu_flags lock_flags; + u32 register_value; + acpi_status status = AE_OK; ACPI_FUNCTION_TRACE_U32(acpi_write_bit_register, register_id); @@ -335,127 +338,71 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value) bit_reg_info = acpi_hw_get_bit_register_info(register_id); if (!bit_reg_info) { - ACPI_ERROR((AE_INFO, "Bad ACPI HW RegisterId: %X", - register_id)); return_ACPI_STATUS(AE_BAD_PARAMETER); } lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); - /* Always do a register read first so we can insert the new bits */ - - status = acpi_hw_register_read(bit_reg_info->parent_register, - ®ister_value); - if (ACPI_FAILURE(status)) { - goto unlock_and_exit; - } - /* - * Decode the Register ID - * Register ID = [Register block ID] | [bit ID] - * - * Check bit ID to fine locate Register offset. - * Check Mask to determine Register offset, and then read-write. + * At this point, we know that the parent register is one of the + * following: PM1 Status, PM1 Enable, PM1 Control, or PM2 Control */ - switch (bit_reg_info->parent_register) { - case ACPI_REGISTER_PM1_STATUS: - - /* - * Status Registers are different from the rest. Clear by - * writing 1, and writing 0 has no effect. So, the only relevant - * information is the single bit we're interested in, all others should - * be written as 0 so they will be left unchanged. - */ - value = ACPI_REGISTER_PREPARE_BITS(value, - bit_reg_info->bit_position, - bit_reg_info-> - access_bit_mask); - if (value) { - status = - acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, - (u16) value); - register_value = 0; - } - break; - - case ACPI_REGISTER_PM1_ENABLE: - - ACPI_REGISTER_INSERT_VALUE(register_value, - bit_reg_info->bit_position, - bit_reg_info->access_bit_mask, - value); - - status = acpi_hw_register_write(ACPI_REGISTER_PM1_ENABLE, - (u16) register_value); - break; - - case ACPI_REGISTER_PM1_CONTROL: - + if (bit_reg_info->parent_register != ACPI_REGISTER_PM1_STATUS) { /* - * Write the PM1 Control register. - * Note that at this level, the fact that there are actually TWO - * registers (A and B - and B may not exist) is abstracted. + * 1) Case for PM1 Enable, PM1 Control, and PM2 Control + * + * Perform a register read to preserve the bits that we are not + * interested in */ - ACPI_DEBUG_PRINT((ACPI_DB_IO, "PM1 control: Read %X\n", - register_value)); - - ACPI_REGISTER_INSERT_VALUE(register_value, - bit_reg_info->bit_position, - bit_reg_info->access_bit_mask, - value); - - status = acpi_hw_register_write(ACPI_REGISTER_PM1_CONTROL, - (u16) register_value); - break; - - case ACPI_REGISTER_PM2_CONTROL: - - status = acpi_hw_register_read(ACPI_REGISTER_PM2_CONTROL, + status = acpi_hw_register_read(bit_reg_info->parent_register, ®ister_value); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } - ACPI_DEBUG_PRINT((ACPI_DB_IO, - "PM2 control: Read %X from %8.8X%8.8X\n", - register_value, - ACPI_FORMAT_UINT64(acpi_gbl_FADT. - xpm2_control_block. - address))); - + /* + * Insert the input bit into the value that was just read + * and write the register + */ ACPI_REGISTER_INSERT_VALUE(register_value, bit_reg_info->bit_position, bit_reg_info->access_bit_mask, value); - ACPI_DEBUG_PRINT((ACPI_DB_IO, - "About to write %4.4X to %8.8X%8.8X\n", - register_value, - ACPI_FORMAT_UINT64(acpi_gbl_FADT. - xpm2_control_block. - address))); + status = acpi_hw_register_write(bit_reg_info->parent_register, + register_value); + } else { + /* + * 2) Case for PM1 Status + * + * The Status register is different from the rest. Clear an event + * by writing 1, writing 0 has no effect. So, the only relevant + * information is the single bit we're interested in, all others + * should be written as 0 so they will be left unchanged. + */ + register_value = ACPI_REGISTER_PREPARE_BITS(value, + bit_reg_info-> + bit_position, + bit_reg_info-> + access_bit_mask); - status = acpi_hw_register_write(ACPI_REGISTER_PM2_CONTROL, - (u8) (register_value)); - break; + /* No need to write the register if value is all zeros */ - default: - break; + if (register_value) { + status = + acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, + register_value); + } } - unlock_and_exit: - - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); - - /* Normalize the value that was read */ + ACPI_DEBUG_PRINT((ACPI_DB_IO, + "BitReg %X, ParentReg %X, Value %8.8X, Actual %8.8X\n", + register_id, bit_reg_info->parent_register, value, + register_value)); - ACPI_DEBUG_EXEC(register_value = - ((register_value & bit_reg_info->access_bit_mask) >> - bit_reg_info->bit_position)); +unlock_and_exit: - ACPI_DEBUG_PRINT((ACPI_DB_IO, - "Set bits: %8.8X actual %8.8X register %X\n", value, - register_value, bit_reg_info->parent_register)); + acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); return_ACPI_STATUS(status); } -- cgit v1.2.3-59-g8ed1b From b6bc342dd543f40b6feccbaf6719d9f836c9964a Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Mon, 23 Feb 2009 10:26:19 +0800 Subject: ACPICA: Update table header print function Cleanup table header output. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/tbutils.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c index dc45b49e5cb9..ef7d2c2d8f0b 100644 --- a/drivers/acpi/acpica/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -177,19 +177,23 @@ acpi_tb_print_table_header(acpi_physical_address address, struct acpi_table_header *header) { + /* + * The reason that the Address is cast to a void pointer is so that we + * can use %p which will work properly on both 32-bit and 64-bit hosts. + */ if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_FACS)) { - /* FACS only has signature and length fields of common table header */ + /* FACS only has signature and length fields */ - ACPI_INFO((AE_INFO, "%4.4s %08lX, %04X", - header->signature, (unsigned long)address, + ACPI_INFO((AE_INFO, "%4.4s %p %05X", + header->signature, ACPI_CAST_PTR(void, address), header->length)); } else if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_RSDP)) { /* RSDP has no common fields */ - ACPI_INFO((AE_INFO, "RSDP %08lX, %04X (r%d %6.6s)", - (unsigned long)address, + ACPI_INFO((AE_INFO, "RSDP %p %05X (v%.2d %6.6s)", + ACPI_CAST_PTR (void, address), (ACPI_CAST_PTR(struct acpi_table_rsdp, header)-> revision > 0) ? ACPI_CAST_PTR(struct acpi_table_rsdp, @@ -202,8 +206,8 @@ acpi_tb_print_table_header(acpi_physical_address address, /* Standard ACPI table with full common header */ ACPI_INFO((AE_INFO, - "%4.4s %08lX, %04X (r%d %6.6s %8.8s %8X %4.4s %8X)", - header->signature, (unsigned long)address, + "%4.4s %p %05X (v%.2d %6.6s %8.8s %08X %4.4s %08X)", + header->signature, ACPI_CAST_PTR (void, address), header->length, header->revision, header->oem_id, header->oem_table_id, header->oem_revision, header->asl_compiler_id, -- cgit v1.2.3-59-g8ed1b From 2affa28605fa5387192c72d9889a00c9c51aa712 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Mon, 23 Feb 2009 10:27:07 +0800 Subject: ACPICA: Update version to 20090220 Version 20090220. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index a5525a5518a9..666182b5ca31 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -47,7 +47,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20090123 +#define ACPI_CA_VERSION 0x20090220 #include "actypes.h" #include "actbl.h" -- cgit v1.2.3-59-g8ed1b From 3341323bb4c198f704cffbfdda37bcec1226ef7d Mon Sep 17 00:00:00 2001 From: Alexander Clouter Date: Fri, 27 Mar 2009 12:59:54 +0800 Subject: hwrng: timeriomem - Use phys address rather than virt There is no ioremap'ing or anything in timeriomem-rng.c as I foolishly used already remapped virtual addresses instead of passing the physical address to be polled. This patch fixes this flaw and lets developers do the Right Thing(tm). Signed-off-by: Alexander Clouter Signed-off-by: Herbert Xu --- drivers/char/hw_random/timeriomem-rng.c | 39 +++++++++++++++++++++++++++++---- include/linux/timeriomem-rng.h | 2 +- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c index 10ad41be5897..dcd352ad0e7f 100644 --- a/drivers/char/hw_random/timeriomem-rng.c +++ b/drivers/char/hw_random/timeriomem-rng.c @@ -90,10 +90,30 @@ static struct hwrng timeriomem_rng_ops = { static int __init timeriomem_rng_probe(struct platform_device *pdev) { + struct resource *res, *mem; int ret; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + + if (!res) + return -ENOENT; + + mem = request_mem_region(res->start, res->end - res->start + 1, + pdev->name); + if (mem == NULL) + return -EBUSY; + + dev_set_drvdata(&pdev->dev, mem); + timeriomem_rng_data = pdev->dev.platform_data; + timeriomem_rng_data->address = ioremap(res->start, + res->end - res->start + 1); + if (!timeriomem_rng_data->address) { + ret = -ENOMEM; + goto err_ioremap; + } + if (timeriomem_rng_data->period != 0 && usecs_to_jiffies(timeriomem_rng_data->period) > 0) { timeriomem_rng_timer.expires = jiffies; @@ -104,23 +124,34 @@ static int __init timeriomem_rng_probe(struct platform_device *pdev) timeriomem_rng_data->present = 1; ret = hwrng_register(&timeriomem_rng_ops); - if (ret) { - dev_err(&pdev->dev, "problem registering\n"); - return ret; - } + if (ret) + goto err_register; dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n", timeriomem_rng_data->address, timeriomem_rng_data->period); return 0; + +err_register: + dev_err(&pdev->dev, "problem registering\n"); + iounmap(timeriomem_rng_data->address); +err_ioremap: + release_resource(mem); + + return ret; } static int __devexit timeriomem_rng_remove(struct platform_device *pdev) { + struct resource *mem = dev_get_drvdata(&pdev->dev); + del_timer_sync(&timeriomem_rng_timer); hwrng_unregister(&timeriomem_rng_ops); + iounmap(timeriomem_rng_data->address); + release_resource(mem); + return 0; } diff --git a/include/linux/timeriomem-rng.h b/include/linux/timeriomem-rng.h index dd253177f65f..3e08a1c86830 100644 --- a/include/linux/timeriomem-rng.h +++ b/include/linux/timeriomem-rng.h @@ -14,7 +14,7 @@ struct timeriomem_rng_data { struct completion completion; unsigned int present:1; - u32 __iomem *address; + void __iomem *address; /* measures in usecs */ unsigned int period; -- cgit v1.2.3-59-g8ed1b From f4f689933c63e0fbfba62f2a80efb2b424b139ae Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 27 Mar 2009 13:03:51 +0800 Subject: crypto: shash - Fix unaligned calculation with short length When the total length is shorter than the calculated number of unaligned bytes, the call to shash->update breaks. For example, calling crc32c on unaligned buffer with length of 1 can result in a system crash. Signed-off-by: Yehuda Sadeh Signed-off-by: Herbert Xu --- crypto/shash.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crypto/shash.c b/crypto/shash.c index 7a659733f94a..2ccc8b0076ce 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -77,6 +77,9 @@ static int shash_update_unaligned(struct shash_desc *desc, const u8 *data, u8 buf[shash_align_buffer_size(unaligned_len, alignmask)] __attribute__ ((aligned)); + if (unaligned_len > len) + unaligned_len = len; + memcpy(buf, data, unaligned_len); return shash->update(desc, buf, unaligned_len) ?: -- cgit v1.2.3-59-g8ed1b From 0d44dc59b2b434b29aafeae581d06f81efac7c83 Mon Sep 17 00:00:00 2001 From: Christian Hohnstaedt Date: Fri, 27 Mar 2009 15:09:05 +0800 Subject: crypto: ixp4xx - Fix handling of chained sg buffers - keep dma functions away from chained scatterlists. Use the existing scatterlist iteration inside the driver to call dma_map_single() for each chunk and avoid dma_map_sg(). Signed-off-by: Christian Hohnstaedt Tested-By: Karl Hiramoto Signed-off-by: Herbert Xu --- drivers/crypto/ixp4xx_crypto.c | 182 ++++++++++++++--------------------------- 1 file changed, 63 insertions(+), 119 deletions(-) diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index d9e751be8c5f..af9761ccf9f1 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c @@ -101,6 +101,7 @@ struct buffer_desc { u32 phys_addr; u32 __reserved[4]; struct buffer_desc *next; + enum dma_data_direction dir; }; struct crypt_ctl { @@ -132,14 +133,10 @@ struct crypt_ctl { struct ablk_ctx { struct buffer_desc *src; struct buffer_desc *dst; - unsigned src_nents; - unsigned dst_nents; }; struct aead_ctx { struct buffer_desc *buffer; - unsigned short assoc_nents; - unsigned short src_nents; struct scatterlist ivlist; /* used when the hmac is not on one sg entry */ u8 *hmac_virt; @@ -312,7 +309,7 @@ static struct crypt_ctl *get_crypt_desc_emerg(void) } } -static void free_buf_chain(struct buffer_desc *buf, u32 phys) +static void free_buf_chain(struct device *dev, struct buffer_desc *buf,u32 phys) { while (buf) { struct buffer_desc *buf1; @@ -320,6 +317,7 @@ static void free_buf_chain(struct buffer_desc *buf, u32 phys) buf1 = buf->next; phys1 = buf->phys_next; + dma_unmap_single(dev, buf->phys_next, buf->buf_len, buf->dir); dma_pool_free(buffer_pool, buf, phys); buf = buf1; phys = phys1; @@ -348,7 +346,6 @@ static void one_packet(dma_addr_t phys) struct crypt_ctl *crypt; struct ixp_ctx *ctx; int failed; - enum dma_data_direction src_direction = DMA_BIDIRECTIONAL; failed = phys & 0x1 ? -EBADMSG : 0; phys &= ~0x3; @@ -358,13 +355,8 @@ static void one_packet(dma_addr_t phys) case CTL_FLAG_PERFORM_AEAD: { struct aead_request *req = crypt->data.aead_req; struct aead_ctx *req_ctx = aead_request_ctx(req); - dma_unmap_sg(dev, req->assoc, req_ctx->assoc_nents, - DMA_TO_DEVICE); - dma_unmap_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL); - dma_unmap_sg(dev, req->src, req_ctx->src_nents, - DMA_BIDIRECTIONAL); - free_buf_chain(req_ctx->buffer, crypt->src_buf); + free_buf_chain(dev, req_ctx->buffer, crypt->src_buf); if (req_ctx->hmac_virt) { finish_scattered_hmac(crypt); } @@ -374,16 +366,11 @@ static void one_packet(dma_addr_t phys) case CTL_FLAG_PERFORM_ABLK: { struct ablkcipher_request *req = crypt->data.ablk_req; struct ablk_ctx *req_ctx = ablkcipher_request_ctx(req); - int nents; + if (req_ctx->dst) { - nents = req_ctx->dst_nents; - dma_unmap_sg(dev, req->dst, nents, DMA_FROM_DEVICE); - free_buf_chain(req_ctx->dst, crypt->dst_buf); - src_direction = DMA_TO_DEVICE; + free_buf_chain(dev, req_ctx->dst, crypt->dst_buf); } - nents = req_ctx->src_nents; - dma_unmap_sg(dev, req->src, nents, src_direction); - free_buf_chain(req_ctx->src, crypt->src_buf); + free_buf_chain(dev, req_ctx->src, crypt->src_buf); req->base.complete(&req->base, failed); break; } @@ -750,56 +737,35 @@ static int setup_cipher(struct crypto_tfm *tfm, int encrypt, return 0; } -static int count_sg(struct scatterlist *sg, int nbytes) +static struct buffer_desc *chainup_buffers(struct device *dev, + struct scatterlist *sg, unsigned nbytes, + struct buffer_desc *buf, gfp_t flags, + enum dma_data_direction dir) { - int i; - for (i = 0; nbytes > 0; i++, sg = sg_next(sg)) - nbytes -= sg->length; - return i; -} - -static struct buffer_desc *chainup_buffers(struct scatterlist *sg, - unsigned nbytes, struct buffer_desc *buf, gfp_t flags) -{ - int nents = 0; - - while (nbytes > 0) { + for (;nbytes > 0; sg = scatterwalk_sg_next(sg)) { + unsigned len = min(nbytes, sg->length); struct buffer_desc *next_buf; u32 next_buf_phys; - unsigned len = min(nbytes, sg_dma_len(sg)); + void *ptr; - nents++; nbytes -= len; - if (!buf->phys_addr) { - buf->phys_addr = sg_dma_address(sg); - buf->buf_len = len; - buf->next = NULL; - buf->phys_next = 0; - goto next; - } - /* Two consecutive chunks on one page may be handled by the old - * buffer descriptor, increased by the length of the new one - */ - if (sg_dma_address(sg) == buf->phys_addr + buf->buf_len) { - buf->buf_len += len; - goto next; - } + ptr = page_address(sg_page(sg)) + sg->offset; next_buf = dma_pool_alloc(buffer_pool, flags, &next_buf_phys); - if (!next_buf) - return NULL; + if (!next_buf) { + buf = NULL; + break; + } + sg_dma_address(sg) = dma_map_single(dev, ptr, len, dir); buf->next = next_buf; buf->phys_next = next_buf_phys; - buf = next_buf; - buf->next = NULL; - buf->phys_next = 0; + buf->phys_addr = sg_dma_address(sg); buf->buf_len = len; -next: - if (nbytes > 0) { - sg = sg_next(sg); - } + buf->dir = dir; } + buf->next = NULL; + buf->phys_next = 0; return buf; } @@ -860,12 +826,12 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt) struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); struct ixp_ctx *ctx = crypto_ablkcipher_ctx(tfm); unsigned ivsize = crypto_ablkcipher_ivsize(tfm); - int ret = -ENOMEM; struct ix_sa_dir *dir; struct crypt_ctl *crypt; - unsigned int nbytes = req->nbytes, nents; + unsigned int nbytes = req->nbytes; enum dma_data_direction src_direction = DMA_BIDIRECTIONAL; struct ablk_ctx *req_ctx = ablkcipher_request_ctx(req); + struct buffer_desc src_hook; gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; @@ -878,7 +844,7 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt) crypt = get_crypt_desc(); if (!crypt) - return ret; + return -ENOMEM; crypt->data.ablk_req = req; crypt->crypto_ctx = dir->npe_ctx_phys; @@ -891,53 +857,41 @@ static int ablk_perform(struct ablkcipher_request *req, int encrypt) BUG_ON(ivsize && !req->info); memcpy(crypt->iv, req->info, ivsize); if (req->src != req->dst) { + struct buffer_desc dst_hook; crypt->mode |= NPE_OP_NOT_IN_PLACE; - nents = count_sg(req->dst, nbytes); /* This was never tested by Intel * for more than one dst buffer, I think. */ - BUG_ON(nents != 1); - req_ctx->dst_nents = nents; - dma_map_sg(dev, req->dst, nents, DMA_FROM_DEVICE); - req_ctx->dst = dma_pool_alloc(buffer_pool, flags,&crypt->dst_buf); - if (!req_ctx->dst) - goto unmap_sg_dest; - req_ctx->dst->phys_addr = 0; - if (!chainup_buffers(req->dst, nbytes, req_ctx->dst, flags)) + BUG_ON(req->dst->length < nbytes); + req_ctx->dst = NULL; + if (!chainup_buffers(dev, req->dst, nbytes, &dst_hook, + flags, DMA_FROM_DEVICE)) goto free_buf_dest; src_direction = DMA_TO_DEVICE; + req_ctx->dst = dst_hook.next; + crypt->dst_buf = dst_hook.phys_next; } else { req_ctx->dst = NULL; - req_ctx->dst_nents = 0; } - nents = count_sg(req->src, nbytes); - req_ctx->src_nents = nents; - dma_map_sg(dev, req->src, nents, src_direction); - - req_ctx->src = dma_pool_alloc(buffer_pool, flags, &crypt->src_buf); - if (!req_ctx->src) - goto unmap_sg_src; - req_ctx->src->phys_addr = 0; - if (!chainup_buffers(req->src, nbytes, req_ctx->src, flags)) + req_ctx->src = NULL; + if (!chainup_buffers(dev, req->src, nbytes, &src_hook, + flags, src_direction)) goto free_buf_src; + req_ctx->src = src_hook.next; + crypt->src_buf = src_hook.phys_next; crypt->ctl_flags |= CTL_FLAG_PERFORM_ABLK; qmgr_put_entry(SEND_QID, crypt_virt2phys(crypt)); BUG_ON(qmgr_stat_overflow(SEND_QID)); return -EINPROGRESS; free_buf_src: - free_buf_chain(req_ctx->src, crypt->src_buf); -unmap_sg_src: - dma_unmap_sg(dev, req->src, req_ctx->src_nents, src_direction); + free_buf_chain(dev, req_ctx->src, crypt->src_buf); free_buf_dest: if (req->src != req->dst) { - free_buf_chain(req_ctx->dst, crypt->dst_buf); -unmap_sg_dest: - dma_unmap_sg(dev, req->src, req_ctx->dst_nents, - DMA_FROM_DEVICE); + free_buf_chain(dev, req_ctx->dst, crypt->dst_buf); } crypt->ctl_flags = CTL_FLAG_UNUSED; - return ret; + return -ENOMEM; } static int ablk_encrypt(struct ablkcipher_request *req) @@ -985,7 +939,7 @@ static int hmac_inconsistent(struct scatterlist *sg, unsigned start, break; offset += sg->length; - sg = sg_next(sg); + sg = scatterwalk_sg_next(sg); } return (start + nbytes > offset + sg->length); } @@ -997,11 +951,10 @@ static int aead_perform(struct aead_request *req, int encrypt, struct ixp_ctx *ctx = crypto_aead_ctx(tfm); unsigned ivsize = crypto_aead_ivsize(tfm); unsigned authsize = crypto_aead_authsize(tfm); - int ret = -ENOMEM; struct ix_sa_dir *dir; struct crypt_ctl *crypt; - unsigned int cryptlen, nents; - struct buffer_desc *buf; + unsigned int cryptlen; + struct buffer_desc *buf, src_hook; struct aead_ctx *req_ctx = aead_request_ctx(req); gfp_t flags = req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ? GFP_KERNEL : GFP_ATOMIC; @@ -1022,7 +975,7 @@ static int aead_perform(struct aead_request *req, int encrypt, } crypt = get_crypt_desc(); if (!crypt) - return ret; + return -ENOMEM; crypt->data.aead_req = req; crypt->crypto_ctx = dir->npe_ctx_phys; @@ -1041,31 +994,27 @@ static int aead_perform(struct aead_request *req, int encrypt, BUG(); /* -ENOTSUP because of my lazyness */ } - req_ctx->buffer = dma_pool_alloc(buffer_pool, flags, &crypt->src_buf); - if (!req_ctx->buffer) - goto out; - req_ctx->buffer->phys_addr = 0; /* ASSOC data */ - nents = count_sg(req->assoc, req->assoclen); - req_ctx->assoc_nents = nents; - dma_map_sg(dev, req->assoc, nents, DMA_TO_DEVICE); - buf = chainup_buffers(req->assoc, req->assoclen, req_ctx->buffer,flags); + buf = chainup_buffers(dev, req->assoc, req->assoclen, &src_hook, + flags, DMA_TO_DEVICE); + req_ctx->buffer = src_hook.next; + crypt->src_buf = src_hook.phys_next; if (!buf) - goto unmap_sg_assoc; + goto out; /* IV */ sg_init_table(&req_ctx->ivlist, 1); sg_set_buf(&req_ctx->ivlist, iv, ivsize); - dma_map_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL); - buf = chainup_buffers(&req_ctx->ivlist, ivsize, buf, flags); + buf = chainup_buffers(dev, &req_ctx->ivlist, ivsize, buf, flags, + DMA_BIDIRECTIONAL); if (!buf) - goto unmap_sg_iv; + goto free_chain; if (unlikely(hmac_inconsistent(req->src, cryptlen, authsize))) { /* The 12 hmac bytes are scattered, * we need to copy them into a safe buffer */ req_ctx->hmac_virt = dma_pool_alloc(buffer_pool, flags, &crypt->icv_rev_aes); if (unlikely(!req_ctx->hmac_virt)) - goto unmap_sg_iv; + goto free_chain; if (!encrypt) { scatterwalk_map_and_copy(req_ctx->hmac_virt, req->src, cryptlen, authsize, 0); @@ -1075,33 +1024,28 @@ static int aead_perform(struct aead_request *req, int encrypt, req_ctx->hmac_virt = NULL; } /* Crypt */ - nents = count_sg(req->src, cryptlen + authsize); - req_ctx->src_nents = nents; - dma_map_sg(dev, req->src, nents, DMA_BIDIRECTIONAL); - buf = chainup_buffers(req->src, cryptlen + authsize, buf, flags); + buf = chainup_buffers(dev, req->src, cryptlen + authsize, buf, flags, + DMA_BIDIRECTIONAL); if (!buf) - goto unmap_sg_src; + goto free_hmac_virt; if (!req_ctx->hmac_virt) { crypt->icv_rev_aes = buf->phys_addr + buf->buf_len - authsize; } + crypt->ctl_flags |= CTL_FLAG_PERFORM_AEAD; qmgr_put_entry(SEND_QID, crypt_virt2phys(crypt)); BUG_ON(qmgr_stat_overflow(SEND_QID)); return -EINPROGRESS; -unmap_sg_src: - dma_unmap_sg(dev, req->src, req_ctx->src_nents, DMA_BIDIRECTIONAL); +free_hmac_virt: if (req_ctx->hmac_virt) { dma_pool_free(buffer_pool, req_ctx->hmac_virt, crypt->icv_rev_aes); } -unmap_sg_iv: - dma_unmap_sg(dev, &req_ctx->ivlist, 1, DMA_BIDIRECTIONAL); -unmap_sg_assoc: - dma_unmap_sg(dev, req->assoc, req_ctx->assoc_nents, DMA_TO_DEVICE); - free_buf_chain(req_ctx->buffer, crypt->src_buf); +free_chain: + free_buf_chain(dev, req_ctx->buffer, crypt->src_buf); out: crypt->ctl_flags = CTL_FLAG_UNUSED; - return ret; + return -ENOMEM; } static int aead_setup(struct crypto_aead *tfm, unsigned int authsize) -- cgit v1.2.3-59-g8ed1b From a16fffdd8eb95ebab7dc22414896fe6493951e0e Mon Sep 17 00:00:00 2001 From: Jonas Larsson Date: Fri, 27 Mar 2009 10:18:14 +0100 Subject: Add Merisc board support Merisc is the family name for a range of AVR32-based boards. The boards are designed to be used in a man-machine interfacing environment, utilizing a touch-based graphical user interface. They host a vast range of I/O peripherals as well as a large SDRAM & Flash memory bank. For more information see: http://www.martinsson.se/merisc Signed-off-by: Jonas Larsson Signed-off-by: Haavard Skinnemoen --- arch/avr32/Kconfig | 14 + arch/avr32/Makefile | 1 + arch/avr32/boards/merisc/Kconfig | 5 + arch/avr32/boards/merisc/Makefile | 1 + arch/avr32/boards/merisc/display.c | 65 ++ arch/avr32/boards/merisc/flash.c | 139 ++++ arch/avr32/boards/merisc/merisc.h | 18 + arch/avr32/boards/merisc/merisc_sysfs.c | 65 ++ arch/avr32/boards/merisc/setup.c | 289 ++++++++ arch/avr32/configs/merisc_defconfig | 1237 +++++++++++++++++++++++++++++++ 10 files changed, 1834 insertions(+) create mode 100644 arch/avr32/boards/merisc/Kconfig create mode 100644 arch/avr32/boards/merisc/Makefile create mode 100644 arch/avr32/boards/merisc/display.c create mode 100644 arch/avr32/boards/merisc/flash.c create mode 100644 arch/avr32/boards/merisc/merisc.h create mode 100644 arch/avr32/boards/merisc/merisc_sysfs.c create mode 100644 arch/avr32/boards/merisc/setup.c create mode 100644 arch/avr32/configs/merisc_defconfig diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index b189680d18b0..8cee7842d0b2 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -144,6 +144,19 @@ config BOARD_FAVR_32 bool "Favr-32 LCD-board" select CPU_AT32AP7000 +config BOARD_MERISC + bool "Merisc board" + select CPU_AT32AP7000 + help + Merisc is the family name for a range of AVR32-based boards. + + The boards are designed to be used in a man-machine + interfacing environment, utilizing a touch-based graphical + user interface. They host a vast range of I/O peripherals as + well as a large SDRAM & Flash memory bank. + + For more information see: http://www.martinsson.se/merisc + config BOARD_MIMC200 bool "MIMC200 CPU board" select CPU_AT32AP7000 @@ -153,6 +166,7 @@ source "arch/avr32/boards/atstk1000/Kconfig" source "arch/avr32/boards/atngw100/Kconfig" source "arch/avr32/boards/hammerhead/Kconfig" source "arch/avr32/boards/favr-32/Kconfig" +source "arch/avr32/boards/merisc/Kconfig" choice prompt "Boot loader type" diff --git a/arch/avr32/Makefile b/arch/avr32/Makefile index f3ef3bbf797c..0b97e14f73f6 100644 --- a/arch/avr32/Makefile +++ b/arch/avr32/Makefile @@ -35,6 +35,7 @@ core-$(CONFIG_BOARD_ATSTK1000) += arch/avr32/boards/atstk1000/ core-$(CONFIG_BOARD_ATNGW100) += arch/avr32/boards/atngw100/ core-$(CONFIG_BOARD_HAMMERHEAD) += arch/avr32/boards/hammerhead/ core-$(CONFIG_BOARD_FAVR_32) += arch/avr32/boards/favr-32/ +core-$(CONFIG_BOARD_MERISC) += arch/avr32/boards/merisc/ core-$(CONFIG_BOARD_MIMC200) += arch/avr32/boards/mimc200/ core-$(CONFIG_LOADER_U_BOOT) += arch/avr32/boot/u-boot/ core-y += arch/avr32/kernel/ diff --git a/arch/avr32/boards/merisc/Kconfig b/arch/avr32/boards/merisc/Kconfig new file mode 100644 index 000000000000..7e043275d5a9 --- /dev/null +++ b/arch/avr32/boards/merisc/Kconfig @@ -0,0 +1,5 @@ +# Merisc customization + +if BOARD_MERISC + +endif # BOARD_MERISC diff --git a/arch/avr32/boards/merisc/Makefile b/arch/avr32/boards/merisc/Makefile new file mode 100644 index 000000000000..d24c78729bd1 --- /dev/null +++ b/arch/avr32/boards/merisc/Makefile @@ -0,0 +1 @@ +obj-y += setup.o flash.o display.o merisc_sysfs.o diff --git a/arch/avr32/boards/merisc/display.c b/arch/avr32/boards/merisc/display.c new file mode 100644 index 000000000000..85a543cd4abc --- /dev/null +++ b/arch/avr32/boards/merisc/display.c @@ -0,0 +1,65 @@ +/* + * Display setup code for the Merisc board + * + * Copyright (C) 2008 Martinsson Elektronik AB + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include