From b332828c39326b1dca617f387dd15d12e81cd5f0 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:43:10 +0530 Subject: hw-breakpoints: prepare the code for Hardware Breakpoint interfaces The generic hardware breakpoint interface provides an abstraction of hardware breakpoints in front of specific arch implementations for both kernel and user side breakpoints. This includes execution breakpoints and read/write breakpoints, also known as "watchpoints". This patch introduces header files containing constants, structure definitions and declaration of functions used by the hardware breakpoint core and x86 specific code. It also introduces an array based storage for the debug-register values in 'struct thread_struct', while modifying all users of debugreg member in the structure. [ Impact: add headers for new hardware breakpoint interface ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/process.c | 16 ++++++++-------- arch/x86/kernel/ptrace.c | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index fb5dfb891f0f..291527cb438a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -106,10 +106,10 @@ void flush_thread(void) clear_tsk_thread_flag(tsk, TIF_DEBUG); - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; + tsk->thread.debugreg[0] = 0; + tsk->thread.debugreg[1] = 0; + tsk->thread.debugreg[2] = 0; + tsk->thread.debugreg[3] = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); @@ -194,10 +194,10 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); + set_debugreg(next->debugreg[0], 0); + set_debugreg(next->debugreg[1], 1); + set_debugreg(next->debugreg[2], 2); + set_debugreg(next->debugreg[3], 3); /* no 4 and 5 */ set_debugreg(next->debugreg6, 6); set_debugreg(next->debugreg7, 7); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde91c13..313be40be55a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -471,10 +471,10 @@ static int genregs_set(struct task_struct *target, static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) { switch (n) { - case 0: return child->thread.debugreg0; - case 1: return child->thread.debugreg1; - case 2: return child->thread.debugreg2; - case 3: return child->thread.debugreg3; + case 0: return child->thread.debugreg[0]; + case 1: return child->thread.debugreg[1]; + case 2: return child->thread.debugreg[2]; + case 3: return child->thread.debugreg[3]; case 6: return child->thread.debugreg6; case 7: return child->thread.debugreg7; } @@ -493,10 +493,10 @@ static int ptrace_set_debugreg(struct task_struct *child, return -EIO; switch (n) { - case 0: child->thread.debugreg0 = data; break; - case 1: child->thread.debugreg1 = data; break; - case 2: child->thread.debugreg2 = data; break; - case 3: child->thread.debugreg3 = data; break; + case 0: child->thread.debugreg[0] = data; break; + case 1: child->thread.debugreg[1] = data; break; + case 2: child->thread.debugreg[2] = data; break; + case 3: child->thread.debugreg[3] = data; break; case 6: if ((data & ~0xffffffffUL) != 0) -- cgit v1.2.3-59-g8ed1b From 0067f1297241ea567f2b22a455519752d70fcca9 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:43:57 +0530 Subject: hw-breakpoints: x86 architecture implementation of Hardware Breakpoint interfaces This patch introduces the arch-specific implementation of the generic hardware breakpoints in kernel/hw_breakpoint.c inside x86 specific directories. It contains functions which help to validate and serve requests using Hardware Breakpoint registers on x86 processors. [ fweisbec@gmail.com: fix conflict against kmemcheck ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/Kconfig | 1 + arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/hw_breakpoint.c | 382 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 384 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/hw_breakpoint.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index df9e885eee14..3033375ed6bc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -46,6 +46,7 @@ config X86 select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_HW_BREAKPOINT config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 77df4d654ff9..cbc781829173 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o -obj-y += alternative.o i8253.o pci-nommu.o +obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..4867c9f3b5fb --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,382 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Unmasked kernel DR7 value */ +static unsigned long kdr7; + +/* + * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register. + * Used to clear and verify the status of bits corresponding to DR0 - DR3 + */ +static const unsigned long dr7_masks[HBP_NUM] = { + 0x000f0003, /* LEN0, R/W0, G0, L0 */ + 0x00f0000c, /* LEN1, R/W1, G1, L1 */ + 0x0f000030, /* LEN2, R/W2, G2, L2 */ + 0xf00000c0 /* LEN3, R/W3, G3, L3 */ +}; + + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) | + DR_GLOBAL_SLOWDOWN; + return bp_info; +} + +void arch_update_kernel_hw_breakpoint(void *unused) +{ + struct hw_breakpoint *bp; + int i, cpu = get_cpu(); + unsigned long temp_kdr7 = 0; + + /* Don't allow debug exceptions while we update the registers */ + set_debugreg(0UL, 7); + + for (i = hbp_kernel_pos; i < HBP_NUM; i++) { + per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i]; + if (bp) { + temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type); + set_debugreg(bp->info.address, i); + } + } + + /* No need to set DR6. Update the debug registers with kernel-space + * breakpoint values from kdr7 and user-space requests from the + * current process + */ + kdr7 = temp_kdr7; + set_debugreg(kdr7 | current->thread.debugreg7, 7); + put_cpu_no_resched(); +} + +/* + * Install the thread breakpoints in their debug registers. + */ +void arch_install_thread_hw_breakpoint(struct task_struct *tsk) +{ + struct thread_struct *thread = &(tsk->thread); + + switch (hbp_kernel_pos) { + case 4: + set_debugreg(thread->debugreg[3], 3); + case 3: + set_debugreg(thread->debugreg[2], 2); + case 2: + set_debugreg(thread->debugreg[1], 1); + case 1: + set_debugreg(thread->debugreg[0], 0); + default: + break; + } + + /* No need to set DR6 */ + set_debugreg((kdr7 | thread->debugreg7), 7); +} + +/* + * Install the debug register values for just the kernel, no thread. + */ +void arch_uninstall_thread_hw_breakpoint() +{ + /* Clear the user-space portion of debugreg7 by setting only kdr7 */ + set_debugreg(kdr7, 7); + +} + +static int get_hbp_len(u8 hbp_len) +{ + unsigned int len_in_bytes = 0; + + switch (hbp_len) { + case HW_BREAKPOINT_LEN_1: + len_in_bytes = 1; + break; + case HW_BREAKPOINT_LEN_2: + len_in_bytes = 2; + break; + case HW_BREAKPOINT_LEN_4: + len_in_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + len_in_bytes = 8; + break; +#endif + } + return len_in_bytes; +} + +/* + * Check for virtual address in user space. + */ +int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va <= TASK_SIZE - len); +} + +/* + * Check for virtual address in kernel space. + */ +int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); +} + +/* + * Store a breakpoint's encoded address, length, and type. + */ +static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk) +{ + /* + * User-space requests will always have the address field populated + * Symbol names from user-space are rejected + */ + if (tsk && bp->info.name) + return -EINVAL; + /* + * For kernel-addresses, either the address or symbol name can be + * specified. + */ + if (bp->info.name) + bp->info.address = (unsigned long) + kallsyms_lookup_name(bp->info.name); + if (bp->info.address) + return 0; + return -EINVAL; +} + +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, + struct task_struct *tsk) +{ + unsigned int align; + int ret = -EINVAL; + + switch (bp->info.type) { + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + case HW_BREAKPOINT_EXECUTE: + if ((!arch_check_va_in_userspace(bp->info.address, + bp->info.len)) && + bp->info.len != HW_BREAKPOINT_LEN_EXECUTE) + return ret; + break; + case HW_BREAKPOINT_WRITE: + break; + case HW_BREAKPOINT_RW: + break; + default: + return ret; + } + + switch (bp->info.len) { + case HW_BREAKPOINT_LEN_1: + align = 0; + break; + case HW_BREAKPOINT_LEN_2: + align = 1; + break; + case HW_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + return ret; + } + + if (bp->triggered) + ret = arch_store_info(bp, tsk); + + if (ret < 0) + return ret; + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (bp->info.address & align) + return -EINVAL; + + /* Check that the virtual address is in the proper range */ + if (tsk) { + if (!arch_check_va_in_userspace(bp->info.address, bp->info.len)) + return -EFAULT; + } else { + if (!arch_check_va_in_kernelspace(bp->info.address, + bp->info.len)) + return -EFAULT; + } + return 0; +} + +void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk) +{ + struct thread_struct *thread = &(tsk->thread); + struct hw_breakpoint *bp = thread->hbp[pos]; + + thread->debugreg7 &= ~dr7_masks[pos]; + if (bp) { + thread->debugreg[pos] = bp->info.address; + thread->debugreg7 |= encode_dr7(pos, bp->info.len, + bp->info.type); + } else + thread->debugreg[pos] = 0; +} + +void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *thread = &(tsk->thread); + + thread->debugreg7 = 0; + for (i = 0; i < HBP_NUM; i++) + thread->debugreg[i] = 0; +} + +/* + * Handle debug exception notifications. + * + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. + * + * NOTIFY_DONE returned if one of the following conditions is true. + * i) When the causative address is from user-space and the exception + * is a valid one, i.e. not triggered as a result of lazy debug register + * switching + * ii) When there are more bits than trap set in DR6 register (such + * as BD, BS or BT) indicating that more than one debug condition is + * met and requires some more action in do_debug(). + * + * NOTIFY_STOP returned for all other cases + * + */ +int __kprobes hw_breakpoint_handler(struct die_args *args) +{ + int i, cpu, rc = NOTIFY_STOP; + struct hw_breakpoint *bp; + /* The DR6 value is stored in args->err */ + unsigned long dr7, dr6 = args->err; + + /* Do an early return if no trap bits are set in DR6 */ + if ((dr6 & DR_TRAP_BITS) == 0) + return NOTIFY_DONE; + + /* Lazy debug register switching */ + if (!test_tsk_thread_flag(current, TIF_DEBUG)) + arch_uninstall_thread_hw_breakpoint(); + + get_debugreg(dr7, 7); + /* Disable breakpoints during exception handling */ + set_debugreg(0UL, 7); + /* + * Assert that local interrupts are disabled + * Reset the DRn bits in the virtualized register value. + * The ptrace trigger routine will add in whatever is needed. + */ + current->thread.debugreg6 &= ~DR_TRAP_BITS; + cpu = get_cpu(); + + /* Handle all the breakpoints that were triggered */ + for (i = 0; i < HBP_NUM; ++i) { + if (likely(!(dr6 & (DR_TRAP0 << i)))) + continue; + /* + * Find the corresponding hw_breakpoint structure and + * invoke its triggered callback. + */ + if (i >= hbp_kernel_pos) + bp = per_cpu(this_hbp_kernel[i], cpu); + else { + bp = current->thread.hbp[i]; + if (bp) + rc = NOTIFY_DONE; + } + /* + * bp can be NULL due to lazy debug register switching + * or due to the delay between updates of hbp_kernel_pos + * and this_hbp_kernel. + */ + if (!bp) + continue; + + (bp->triggered)(bp, args->regs); + } + if (dr6 & (~DR_TRAP_BITS)) + rc = NOTIFY_DONE; + + set_debugreg(dr7, 7); + put_cpu_no_resched(); + return rc; +} + +/* + * Handle debug exception notifications. + */ +int __kprobes hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} -- cgit v1.2.3-59-g8ed1b From 08d68323d1f0c34452e614263b212ca556dae47f Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:08 +0530 Subject: hw-breakpoints: modifying generic debug exception to use thread-specific debug registers This patch modifies the breakpoint exception handler code to use the new abstract debug register names. [ fweisbec@gmail.com: fix conflict against kmemcheck ] [ Impact: refactor and cleanup x86 debug exception handler ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/traps.c | 69 +++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff0..de9913247dd0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -529,73 +529,52 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned long condition; + unsigned long dr6; int si_code; - get_debugreg(condition, 6); + get_debugreg(dr6, 6); + /* DR6 may or may not be cleared by the CPU */ + set_debugreg(0, 6); /* * The processor cleared BTF, so don't mark that we need it set. */ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); tsk->thread.debugctlmsr = 0; - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + /* Store the virtualized DR6 value */ + tsk->thread.debugreg6 = dr6; + + if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code, SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; + if (regs->flags & X86_VM_MASK) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, 1); + return; } -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). + * Single-stepping through system calls: ignore any exceptions in + * kernel space, but re-enable TF when returning to user mode. + * + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if ((dr6 & DR_STEP) && !user_mode(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); + si_code = get_si_code(tsk->thread.debugreg6); + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) + send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); - return; -#ifdef CONFIG_X86_32 -debug_vm86: - /* reenable preemption: handle_vm86_trap() might sleep */ - dec_preempt_count(); - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); return; } -- cgit v1.2.3-59-g8ed1b From 1e3500666f7c5daaadadb8431a2927cdbbdb7dd4 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:26 +0530 Subject: hw-breakpoints: use wrapper routines around debug registers in processor related functions This patch enables the use of wrapper routines to access the debug/breakpoint registers on cpu management. The hardcoded debug registers save and restore operations for threads breakpoints are replaced by wrappers. And now that we handle the kernel breakpoints too, we also need to handle them on cpu hotplug operations. [ Impact: adapt new hardware breakpoint api to cpu hotplug ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/smpboot.c | 3 +++ arch/x86/power/cpu_32.c | 13 +++---------- arch/x86/power/cpu_64.c | 12 +++--------- 3 files changed, 9 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef917d8..2b2652d205c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -326,6 +327,7 @@ notrace static void __cpuinit start_secondary(void *unused) setup_secondary_clock(); wmb(); + load_debug_registers(); cpu_idle(); } @@ -1250,6 +1252,7 @@ void cpu_disable_common(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(); + hw_breakpoint_disable(); } int native_cpu_disable(void) diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 519913948003..2bc3b016de90 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -13,6 +13,7 @@ #include #include #include +#include static struct saved_context saved_context; @@ -48,6 +49,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); ctxt->cr4 = read_cr4_safe(); + hw_breakpoint_disable(); } /* Needed by apm.c */ @@ -83,16 +85,7 @@ static void fix_processor_context(void) /* * Now maybe reload the debug registers */ - if (current->thread.debugreg7) { - set_debugreg(current->thread.debugreg[0], 0); - set_debugreg(current->thread.debugreg[1], 1); - set_debugreg(current->thread.debugreg[2], 2); - set_debugreg(current->thread.debugreg[3], 3); - /* no 4 and 5 */ - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); - } - + load_debug_registers(); } static void __restore_processor_state(struct saved_context *ctxt) diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 1e3bdcc959ff..46866a13a93a 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -16,6 +16,7 @@ #include #include #include +#include static void fix_processor_context(void); @@ -71,6 +72,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr3 = read_cr3(); ctxt->cr4 = read_cr4(); ctxt->cr8 = read_cr8(); + hw_breakpoint_disable(); } void save_processor_state(void) @@ -162,13 +164,5 @@ static void fix_processor_context(void) /* * Now maybe reload the debug registers */ - if (current->thread.debugreg7){ - set_debugreg(current->thread.debugreg[0], 0); - set_debugreg(current->thread.debugreg[1], 1); - set_debugreg(current->thread.debugreg[2], 2); - set_debugreg(current->thread.debugreg[3], 3); - /* no 4 and 5 */ - loaddebug(¤t->thread, 6); - loaddebug(¤t->thread, 7); - } + load_debug_registers(); } -- cgit v1.2.3-59-g8ed1b From 66cb5917295958652ff6ba36d83f98f2379c46b4 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:55 +0530 Subject: hw-breakpoints: use the new wrapper routines to access debug registers in process/thread code This patch enables the use of abstract debug registers in process-handling routines, according to the new hardware breakpoint Api. [ Impact: adapt thread breakpoints handling code to the new breakpoint Api ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/process.c | 22 ++++++---------------- arch/x86/kernel/process_32.c | 28 ++++++++++++++++++++++++++++ arch/x86/kernel/process_64.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 291527cb438a..19a686c401b5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -46,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + flush_thread_hw_breakpoint(tsk); WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } @@ -106,12 +110,8 @@ void flush_thread(void) clear_tsk_thread_flag(tsk, TIF_DEBUG); - tsk->thread.debugreg[0] = 0; - tsk->thread.debugreg[1] = 0; - tsk->thread.debugreg[2] = 0; - tsk->thread.debugreg[3] = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + flush_thread_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -193,16 +193,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg[0], 0); - set_debugreg(next->debugreg[1], 1); - set_debugreg(next->debugreg[2], 2); - set_debugreg(next->debugreg[3], 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index b5e4bfef4472..297ffff2ffc2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -61,6 +61,8 @@ #include #include #include +#include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -265,7 +267,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->thread.io_bitmap_ptr = NULL; tsk = current; + err = -ENOMEM; + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + if (copy_thread_hw_breakpoint(tsk, p, clone_flags)) + goto out; + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); @@ -285,10 +293,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); +out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + if (err) + flush_thread_hw_breakpoint(p); clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); p->thread.ds_ctx = NULL; @@ -427,6 +438,23 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) lazy_load_gs(next->gs); percpu_write(current_task, next_p); + /* + * There's a problem with moving the arch_install_thread_hw_breakpoint() + * call before current is updated. Suppose a kernel breakpoint is + * triggered in between the two, the hw-breakpoint handler will see that + * the 'current' task does not have TIF_DEBUG flag set and will think it + * is leftover from an old task (lazy switching) and will erase it. Then + * until the next context switch, no user-breakpoints will be installed. + * + * The real problem is that it's impossible to update both current and + * physical debug registers at the same instant, so there will always be + * a window in which they disagree and a breakpoint might get triggered. + * Since we use lazy switching, we are forced to assume that a + * disagreement means that current is correct and the exception is due + * to lazy debug register switching. + */ + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) + arch_install_thread_hw_breakpoint(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5a1a1de292ec..f7b276d4b3fb 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,6 +55,8 @@ #include #include #include +#include +#include asmlinkage extern void ret_from_fork(void); @@ -248,6 +250,8 @@ void release_thread(struct task_struct *dead_task) BUG(); } } + if (unlikely(dead_task->thread.debugreg7)) + flush_thread_hw_breakpoint(dead_task); } static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) @@ -303,12 +307,18 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; + p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); savesegment(fs, p->thread.fsindex); savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + err = -ENOMEM; + if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG))) + if (copy_thread_hw_breakpoint(me, p, clone_flags)) + goto out; + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -347,6 +357,9 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + if (err) + flush_thread_hw_breakpoint(p); + return err; } @@ -492,6 +505,24 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); + /* + * There's a problem with moving the arch_install_thread_hw_breakpoint() + * call before current is updated. Suppose a kernel breakpoint is + * triggered in between the two, the hw-breakpoint handler will see that + * the 'current' task does not have TIF_DEBUG flag set and will think it + * is leftover from an old task (lazy switching) and will erase it. Then + * until the next context switch, no user-breakpoints will be installed. + * + * The real problem is that it's impossible to update both current and + * physical debug registers at the same instant, so there will always be + * a window in which they disagree and a breakpoint might get triggered. + * Since we use lazy switching, we are forced to assume that a + * disagreement means that current is correct and the exception is due + * to lazy debug register switching. + */ + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) + arch_install_thread_hw_breakpoint(next_p); + return prev_p; } -- cgit v1.2.3-59-g8ed1b From da0cdc14f5f7e0faee6b2393fefed056cdb17146 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:45:03 +0530 Subject: hw-breakpoints: modify signal handling code to refrain from re-enabling HW Breakpoints This patch disables re-enabling of Hardware Breakpoint registers through the signal handling code. This is now done during from hw_breakpoint_handler(). Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/signal.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e3..f33d2e0ef095 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -800,15 +800,6 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* - * Re-enable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* -- cgit v1.2.3-59-g8ed1b From 72f674d203cd230426437cdcf7dd6f681dad8b0d Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:45:48 +0530 Subject: hw-breakpoints: modify Ptrace routines to access breakpoint registers This patch modifies the ptrace code to use the new wrapper routines around the debug/breakpoint registers. [ Impact: adapt x86 ptrace to the new breakpoint Api ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Signed-off-by: Maneesh Soni Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ptrace.c | 231 +++++++++++++++++++++++++++++------------------ 1 file changed, 141 insertions(+), 90 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 313be40be55a..b457f78b7dbf 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -136,11 +137,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ - return TASK_SIZE - 3; -} - #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) @@ -265,15 +261,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - return IA32_PAGE_OFFSET - 3; -#endif - return TASK_SIZE_MAX - 7; -} - #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -464,95 +451,159 @@ static int genregs_set(struct task_struct *target, } /* - * This function is trivial and will be inlined by the compiler. - * Having it separates the implementation details of debug - * registers from the interface details of ptrace. + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. */ -static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, + unsigned *type) { - switch (n) { - case 0: return child->thread.debugreg[0]; - case 1: return child->thread.debugreg[1]; - case 2: return child->thread.debugreg[2]; - case 3: return child->thread.debugreg[3]; - case 6: return child->thread.debugreg6; - case 7: return child->thread.debugreg7; - } - return 0; + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; } -static int ptrace_set_debugreg(struct task_struct *child, - int n, unsigned long data) +static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs) { + struct thread_struct *thread = &(current->thread); int i; - if (unlikely(n == 4 || n == 5)) - return -EIO; + /* + * Store in the virtual DR6 register the fact that the breakpoint + * was hit so the thread's debugger will see it. + */ + for (i = 0; i < hbp_kernel_pos; i++) + /* + * We will check bp->info.address against the address stored in + * thread's hbp structure and not debugreg[i]. This is to ensure + * that the corresponding bit for 'i' in DR7 register is enabled + */ + if (bp->info.address == thread->hbp[i]->info.address) + break; - if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) - return -EIO; + thread->debugreg6 |= (DR_TRAP0 << i); +} - switch (n) { - case 0: child->thread.debugreg[0] = data; break; - case 1: child->thread.debugreg[1] = data; break; - case 2: child->thread.debugreg[2] = data; break; - case 3: child->thread.debugreg[3] = data; break; +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long old_dr7 = thread->debugreg7; + int i, orig_ret = 0, rc = 0; + int enabled, second_pass = 0; + unsigned len, type; + struct hw_breakpoint *bp; + + data &= ~DR_CONTROL_RESERVED; +restore: + /* + * Loop through all the hardware breakpoints, making the + * appropriate changes to each. + */ + for (i = 0; i < HBP_NUM; i++) { + enabled = decode_dr7(data, i, &len, &type); + bp = thread->hbp[i]; + + if (!enabled) { + if (bp) { + /* Don't unregister the breakpoints right-away, + * unless all register_user_hw_breakpoint() + * requests have succeeded. This prevents + * any window of opportunity for debug + * register grabbing by other users. + */ + if (!second_pass) + continue; + unregister_user_hw_breakpoint(tsk, bp); + kfree(bp); + } + continue; + } + if (!bp) { + rc = -ENOMEM; + bp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL); + if (bp) { + bp->info.address = thread->debugreg[i]; + bp->triggered = ptrace_triggered; + bp->info.len = len; + bp->info.type = type; + rc = register_user_hw_breakpoint(tsk, bp); + if (rc) + kfree(bp); + } + } else + rc = modify_user_hw_breakpoint(tsk, bp); + if (rc) + break; + } + /* + * Make a second pass to free the remaining unused breakpoints + * or to restore the original breakpoints if an error occurred. + */ + if (!second_pass) { + second_pass = 1; + if (rc < 0) { + orig_ret = rc; + data = old_dr7; + } + goto restore; + } + return ((orig_ret < 0) ? orig_ret : rc); +} - case 6: - if ((data & ~0xffffffffUL) != 0) - return -EIO; - child->thread.debugreg6 = data; - break; +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. + */ +unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long val = 0; + + if (n < HBP_NUM) + val = thread->debugreg[n]; + else if (n == 6) + val = thread->debugreg6; + else if (n == 7) + val = thread->debugreg7; + return val; +} - case 7: - /* - * Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. - * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 24593 (AMD64 System Programming) - */ -#ifdef CONFIG_X86_32 -#define DR7_MASK 0x5f54 -#else -#define DR7_MASK 0x5554 -#endif - data &= ~DR_CONTROL_RESERVED; - for (i = 0; i < 4; i++) - if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; +/* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +{ + struct thread_struct *thread = &(tsk->thread); + int rc = 0; + + /* There are no DR4 or DR5 registers */ + if (n == 4 || n == 5) + return -EIO; + + if (n == 6) { + tsk->thread.debugreg6 = val; + goto ret_path; } + if (n < HBP_NUM) { + if (thread->hbp[n]) { + if (arch_check_va_in_userspace(val, + thread->hbp[n]->info.len) == 0) { + rc = -EIO; + goto ret_path; + } + thread->hbp[n]->info.address = val; + } + thread->debugreg[n] = val; + } + /* All that's left is DR7 */ + if (n == 7) + rc = ptrace_write_dr7(tsk, val); - return 0; +ret_path: + return rc; } /* -- cgit v1.2.3-59-g8ed1b From 17f557e5b5d43a2af66c969f6560ac7105020672 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:46:03 +0530 Subject: hw-breakpoints: cleanup HW Breakpoint registers before kexec This patch disables Hardware breakpoints before doing a 'kexec' on the machine so that the cpu doesn't keep debug registers values which would be out of sync for the new image. Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/machine_kexec_32.c | 2 ++ arch/x86/kernel/machine_kexec_64.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1c429d00130..c843f8406da2 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -25,6 +25,7 @@ #include #include #include +#include static void set_idt(void *newidt, __u16 limit) { @@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 84c3bf209e98..4a8bb82248ae 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,7 @@ #include #include #include +#include static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) @@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC -- cgit v1.2.3-59-g8ed1b From 62edab9056a6cf0c9207339c8892c923a5217e45 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:47:06 +0530 Subject: hw-breakpoints: reset bits in dr6 after the corresponding exception is handled This patch resets the bit in dr6 after the corresponding exception is handled in code, so that we keep a clean track of the current virtual debug status register. [ Impact: keep track of breakpoints triggering completion ] Signed-off-by: K.Prasad Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/hw_breakpoint.c | 13 +++++++++++-- arch/x86/kernel/kgdb.c | 6 ++++++ arch/x86/kernel/kprobes.c | 9 ++++++++- arch/x86/kernel/traps.c | 4 ++-- arch/x86/mm/kmmio.c | 8 +++++++- 5 files changed, 34 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4867c9f3b5fb..69451473dbd2 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -314,8 +314,12 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct hw_breakpoint *bp; - /* The DR6 value is stored in args->err */ - unsigned long dr7, dr6 = args->err; + unsigned long dr7, dr6; + unsigned long *dr6_p; + + /* The DR6 value is pointed by args->err */ + dr6_p = (unsigned long *)ERR_PTR(args->err); + dr6 = *dr6_p; /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) @@ -351,6 +355,11 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) if (bp) rc = NOTIFY_DONE; } + /* + * Reset the 'i'th TRAP bit in dr6 to denote completion of + * exception handling + */ + (*dr6_p) &= ~(DR_TRAP0 << i); /* * bp can be NULL due to lazy debug register switching * or due to the delay between updates of hbp_kernel_pos diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b1f4dffb919e..f820b73c7f28 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -43,6 +43,7 @@ #include #include +#include #include #include @@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) "resuming...\n"); kgdb_arch_handle_exception(args->trapnr, args->signr, args->err, "c", "", regs); + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; return NOTIFY_STOP; } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..b5b1848c5336 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -54,6 +54,7 @@ #include #include #include +#include void jprobe_return_end(void); @@ -967,8 +968,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) + if (post_kprobe_handler(args->regs)) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; ret = NOTIFY_STOP; + } break; case DIE_GPF: /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index de9913247dd0..124a4d5a95b2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -545,8 +545,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; - if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code, - SIGTRAP) == NOTIFY_STOP) + if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 16ccbd77917f..11a4ad4d6253 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) struct die_args *arg = args; if (val == DIE_DEBUG && (arg->err & DR_STEP)) - if (post_kmmio_handler(arg->err, arg->regs) == 1) + if (post_kmmio_handler(arg->err, arg->regs) == 1) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP; return NOTIFY_STOP; + } return NOTIFY_DONE; } -- cgit v1.2.3-59-g8ed1b From 4555835b707d5c778ee1c9076670bc99b1eeaf61 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 17 Jun 2009 14:44:19 +0530 Subject: x86: hw_breakpoint.c arch_check_va_in_kernelspace and hw_breakpoint_handler should be static arch_check_va_in_kernelspace() and hw_breakpoint_handler() is used only by same file so it should be static. Also fixed non-ANSI function declaration of function 'arch_uninstall_thread_hw_breakpoint' Fixed following sparse warnings : arch/x86/kernel/hw_breakpoint.c:124:42: warning: non-ANSI function declaration of function 'arch_uninstall_thread_hw_breakpoint' arch/x86/kernel/hw_breakpoint.c:169:5: warning: symbol 'arch_check_va_in_kernelspace' was not declared. Should it be static? arch/x86/kernel/hw_breakpoint.c:313:15: warning: symbol 'hw_breakpoint_handler' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput Cc: Alan Stern Cc: "K.Prasad" Cc: Frederic Weisbecker LKML-Reference: <1245230059.2662.4.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 51d959528b1d..9316a9de4de3 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -121,7 +121,7 @@ void arch_install_thread_hw_breakpoint(struct task_struct *tsk) /* * Install the debug register values for just the kernel, no thread. */ -void arch_uninstall_thread_hw_breakpoint() +void arch_uninstall_thread_hw_breakpoint(void) { /* Clear the user-space portion of debugreg7 by setting only kdr7 */ set_debugreg(kdr7, 7); @@ -166,7 +166,7 @@ int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) /* * Check for virtual address in kernel space. */ -int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) { unsigned int len; @@ -310,7 +310,7 @@ void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) * NOTIFY_STOP returned for all other cases * */ -int __kprobes hw_breakpoint_handler(struct die_args *args) +static int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct hw_breakpoint *bp; -- cgit v1.2.3-59-g8ed1b From 9d22b536609abf0d64648f99518676ea58245e3b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 1 Jul 2009 19:52:30 +0530 Subject: x86: Mark ptrace_get_debugreg() as static This sparse warning: arch/x86/kernel/ptrace.c:560:15: warning: symbol 'ptrace_get_debugreg' was not declared. Should it be static? triggers because ptrace_get_debugreg() is global but is only used in a single .c file. change ptrace_get_debugreg() to static to fix that - this also addresses the sparse warning. Signed-off-by: Jaswinder Singh Rajput Cc: Steven Rostedt LKML-Reference: <1246458150.6940.19.camel@hpdv5.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b457f78b7dbf..cabdabce3cb2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -557,7 +557,7 @@ restore: /* * Handle PTRACE_PEEKUSR calls for the debug register area. */ -unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { struct thread_struct *thread = &(tsk->thread); unsigned long val = 0; -- cgit v1.2.3-59-g8ed1b From b46b3d70c9c017d7c4ec49f7f3ffd0af5a622277 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:28 -0400 Subject: kprobes: Checks probe address is instruction boudary on x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure safeness of inserting kprobes by checking whether the specified address is at the first byte of an instruction on x86. This is done by decoding probed function from its head to the probe point. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203428.31965.21939.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..aa15f3e1f64b 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -48,12 +48,14 @@ #include #include #include +#include #include #include #include #include #include +#include void jprobe_return_end(void); @@ -244,6 +246,75 @@ retry: } } +/* Recover the probed instruction at addr for further analysis. */ +static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + kp = get_kprobe((void *)addr); + if (!kp) + return -EINVAL; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, fix_riprel() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. + */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return 0; +} + +/* Dummy buffers for kallsyms_lookup */ +static char __dummy_buf[KSYM_NAME_LEN]; + +/* Check if paddr is at an instruction boundary */ +static int __kprobes can_probe(unsigned long paddr) +{ + int ret; + unsigned long addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + /* + * Another debugging subsystem might insert + * this breakpoint. In that case, we can't + * recover it. + */ + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + addr += insn.length; + } + + return (addr == paddr); +} + /* * Returns non-zero if opcode modifies the interrupt flag. */ @@ -359,6 +430,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) -- cgit v1.2.3-59-g8ed1b From 89ae465b0ee470f7d3f8a1c61353445c3acbbe2a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:36 -0400 Subject: kprobes: Cleanup fix_riprel() using insn decoder on x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup fix_riprel() in arch/x86/kernel/kprobes.c by using the new x86 instruction decoder instead of using comparisons with raw ad hoc numeric opcodes. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203436.31965.34374.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 128 +++++++++------------------------------------- 1 file changed, 23 insertions(+), 105 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index aa15f3e1f64b..16ae9610f6ff 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -108,50 +108,6 @@ static const u32 twobyte_is_boostable[256 / 32] = { /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -static const u32 onebyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ - W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ - W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ - W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ - W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ - W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ - W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ - W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; -static const u32 twobyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ - W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ - W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ - W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ - W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ - W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ - W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; #undef W struct kretprobe_blackpoint kretprobe_blacklist[] = { @@ -348,68 +304,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) static void __kprobes fix_riprel(struct kprobe *p) { #ifdef CONFIG_X86_64 - u8 *insn = p->ainsn.insn; - s64 disp; - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } + struct insn insn; + kernel_insn_init(&insn, p->ainsn.insn); - /* Skip REX instruction prefix. */ - if (is_REX_prefix(insn)) - ++insn; - - if (*insn == 0x0f) { - /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, - (unsigned long *)twobyte_has_modrm); - } else - /* One-byte opcode. */ - need_modrm = test_bit(*insn, - (unsigned long *)onebyte_has_modrm); - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { - /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - ++insn; - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - disp = (u8 *) p->addr + *((s32 *) insn) - - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *(s32 *)insn = (s32) disp; - } + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) p->addr + (s64) insn.displacement.value - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ + disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + *(s32 *) disp = (s32) newdisp; } #endif } -- cgit v1.2.3-59-g8ed1b From b1cf540f0e5278ecfe8532557e547d833ed269d7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:44 -0400 Subject: x86: Add pt_regs register and stack access APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add following APIs for accessing registers and stack entries from pt_regs. These APIs are required by kprobes-based event tracer on ftrace. Some other debugging tools might be able to use it too. - regs_query_register_offset(const char *name) Query the offset of "name" register. - regs_query_register_name(unsigned int offset) Query the name of register by its offset. - regs_get_register(struct pt_regs *regs, unsigned int offset) Get the value of a register by its offset. - regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) Check the address is in the kernel stack. - regs_get_kernel_stack_nth(struct pt_regs *reg, unsigned int nth) Get Nth entry of the kernel stack. (N >= 0) - regs_get_argument_nth(struct pt_regs *reg, unsigned int nth) Get Nth argument at function call. (N >= 0) Signed-off-by: Masami Hiramatsu Cc: linux-arch@vger.kernel.org Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203444.31965.26374.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/ptrace.h | 62 +++++++++++++++++++++++ arch/x86/kernel/ptrace.c | 112 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 174 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 0f0d908349aa..a3d49dd7d26e 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -7,6 +7,7 @@ #ifdef __KERNEL__ #include +#include #endif #ifndef __ASSEMBLY__ @@ -216,6 +217,67 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) return regs->sp; } +/* Query offset/name of register from its name/offset */ +extern int regs_query_register_offset(const char *name); +extern const char *regs_query_register_name(unsigned int offset); +#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss)) + +/** + * regs_get_register() - get register value from its offset + * @regs: pt_regs from which register value is gotten. + * @offset: offset number of the register. + * + * regs_get_register returns the value of a register whose offset from @regs + * is @offset. The @offset is the offset of the register in struct pt_regs. + * If @offset is bigger than MAX_REG_OFFSET, this returns 0. + */ +static inline unsigned long regs_get_register(struct pt_regs *regs, + unsigned int offset) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return 0; + return *(unsigned long *)((unsigned long)regs + offset); +} + +/** + * regs_within_kernel_stack() - check the address in the stack + * @regs: pt_regs which contains kernel stack pointer. + * @addr: address which is checked. + * + * regs_within_kenel_stack() checks @addr is within the kernel stack page(s). + * If @addr is within the kernel stack, it returns true. If not, returns false. + */ +static inline int regs_within_kernel_stack(struct pt_regs *regs, + unsigned long addr) +{ + return ((addr & ~(THREAD_SIZE - 1)) == + (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); +} + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs: pt_regs which contains kernel stack pointer. + * @n: stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specifined by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0. + */ +static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, + unsigned int n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + addr += n; + if (regs_within_kernel_stack(regs, (unsigned long)addr)) + return *addr; + else + return 0; +} + +/* Get Nth argument at function call */ +extern unsigned long regs_get_argument_nth(struct pt_regs *regs, + unsigned int n); + /* * These are defined as per linux/ptrace.h, which see. */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8d7d5c9c1be3..a33a17d5d5c8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -49,6 +49,118 @@ enum x86_regset { REGSET_IOPERM32, }; +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +static const int arg_offs_table[] = { +#ifdef CONFIG_X86_32 + [0] = offsetof(struct pt_regs, ax), + [1] = offsetof(struct pt_regs, dx), + [2] = offsetof(struct pt_regs, cx) +#else /* CONFIG_X86_64 */ + [0] = offsetof(struct pt_regs, di), + [1] = offsetof(struct pt_regs, si), + [2] = offsetof(struct pt_regs, dx), + [3] = offsetof(struct pt_regs, cx), + [4] = offsetof(struct pt_regs, r8), + [5] = offsetof(struct pt_regs, r9) +#endif +}; + +/** + * regs_get_argument_nth() - get Nth argument at function call + * @regs: pt_regs which contains registers at function entry. + * @n: argument number. + * + * regs_get_argument_nth() returns @n th argument of a function call. + * Since usually the kernel stack will be changed right after function entry, + * you must use this at function entry. If the @n th entry is NOT in the + * kernel stack or pt_regs, this returns 0. + */ +unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) +{ + if (n < ARRAY_SIZE(arg_offs_table)) + return *((unsigned long *)regs + arg_offs_table[n]); + else { + /* + * The typical case: arg n is on the stack. + * (Note: stack[0] = return address, so skip it) + */ + n -= ARRAY_SIZE(arg_offs_table); + return regs_get_kernel_stack_nth(regs, 1 + n); + } +} + /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. -- cgit v1.2.3-59-g8ed1b From 24851d2447830e6cba4c4b641cb73e713f312373 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 26 Aug 2009 23:38:30 +0200 Subject: tracing/kprobes: Dump the culprit kprobe in case of kprobe recursion Kprobes can enter into a probing recursion, ie: a kprobe that does an endless loop because one of its core mechanism function used during probing is also probed itself. This patch helps pinpointing the kprobe that raised such recursion by dumping it and raising a BUG instead of a warning (we also disarm the kprobe to try avoiding recursion in BUG itself). Having a BUG instead of a warning stops the stacktrace in the right place and doesn't pollute the logs with hundreds of traces that eventually end up in a stack overflow. Signed-off-by: Frederic Weisbecker Cc: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli --- arch/x86/kernel/kprobes.c | 8 ++++++-- include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 7 +++++++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 16ae9610f6ff..ecee3d23fef8 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -490,9 +490,13 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, /* A probe has been hit in the codepath leading up * to, or just after, single-stepping of a probed * instruction. This entire codepath should strictly - * reside in .kprobes.text section. Raise a warning - * to highlight this peculiar case. + * reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless + * reentering loop and eventually a stack overflow. */ + arch_disarm_kprobe(p); + dump_kprobe(p); + BUG(); } default: /* impossible cases */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index bcd9c07848be..87eb79c9dd60 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -296,6 +296,8 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); int disable_kprobe(struct kprobe *kp); int enable_kprobe(struct kprobe *kp); +void dump_kprobe(struct kprobe *kp); + #else /* !CONFIG_KPROBES: */ static inline int kprobes_built_in(void) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ef177d653b2c..f72e96c25a38 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1141,6 +1141,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) arch_remove_kprobe(p); } +void __kprobes dump_kprobe(struct kprobe *kp) +{ + printk(KERN_WARNING "Dumping kprobe:\n"); + printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", + kp->symbol_name, kp->addr, kp->offset); +} + /* Module notifier call back, checking kprobes on the module */ static int __kprobes kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) -- cgit v1.2.3-59-g8ed1b From e9afe9e1b3fdbd56cca53959a2519e70db9c8095 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 27 Aug 2009 13:22:58 -0400 Subject: kprobes/x86: Call BUG() when reentering probe into KPROBES_HIT_SS Call BUG() when a probe have been hit on the way of kprobe processing path, because that kind of probes are currently unrecoverable (recovering it will cause an infinite loop and stack overflow). The original code seems to assume that it's caused by an int3 which another subsystem inserted on out-of-line singlestep buffer if the hitting probe is same as current probe. However, in that case, int3-hitting-address is on the out-of-line buffer and should be different from first (current) int3 address. Thus, I decided to remove the code. I also removes arch_disarm_kprobe() because it will involve other stuffs in text_poke(). Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar LKML-Reference: <20090827172258.8246.61889.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index ecee3d23fef8..e0fb615ba1e9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -482,22 +482,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, kcb->kprobe_status = KPROBE_REENTER; break; case KPROBE_HIT_SS: - if (p == kprobe_running()) { - regs->flags &= ~X86_EFLAGS_TF; - regs->flags |= kcb->kprobe_saved_flags; - return 0; - } else { - /* A probe has been hit in the codepath leading up - * to, or just after, single-stepping of a probed - * instruction. This entire codepath should strictly - * reside in .kprobes.text section. - * Raise a BUG or we'll continue in an endless - * reentering loop and eventually a stack overflow. - */ - arch_disarm_kprobe(p); - dump_kprobe(p); - BUG(); - } + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. + */ + printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", + p->addr); + dump_kprobe(p); + BUG(); default: /* impossible cases */ WARN_ON(1); -- cgit v1.2.3-59-g8ed1b From f5ad31158d60946b9fd18c8a79c283a6bc432430 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 27 Aug 2009 13:23:04 -0400 Subject: kprobes/x86-64: Allow to reenter probe on post_handler Allow to reenter probe on the post_handler of another probe on x86-64, because x86-64 already allows reentering int3. In that case, reentered probe just increases kp.nmissed and returns. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar LKML-Reference: <20090827172304.8246.4822.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e0fb615ba1e9..c5f1f117e0c0 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -463,17 +463,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: -#ifdef CONFIG_X86_64 - /* TODO: Provide re-entrancy from post_kprobes_handler() and - * avoid exception stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->ip = (unsigned long)p->addr; - reset_current_kprobe(); - preempt_enable_no_resched(); - break; -#endif case KPROBE_HIT_ACTIVE: save_previous_kprobe(kcb); set_current_kprobe(p, regs, kcb); -- cgit v1.2.3-59-g8ed1b From 8222d718b3ad3ae49c48f69ae4b6a1128c9a92cf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 27 Aug 2009 13:23:25 -0400 Subject: kprobes/x86-64: Fix to move common_interrupt to .kprobes.text Since nmi, debug and int3 returns to irq_return inside common_interrupt, probing this function will cause int3-loop, so it should be marked as __kprobes. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar LKML-Reference: <20090827172325.8246.40000.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/entry_64.S | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c251be745107..36e2ef5cc83f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -809,6 +809,10 @@ END(interrupt) call \func .endm +/* + * Interrupt entry/exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" /* * The interrupt stubs push (~vector+0x80) onto the stack and * then jump to common_interrupt. @@ -947,6 +951,10 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) +/* + * End of kprobes section + */ + .popsection /* * APIC interrupts. -- cgit v1.2.3-59-g8ed1b From a00e817f42663941ea0aa5f85a9d1c4f8b212839 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Sep 2009 12:47:55 -0400 Subject: kprobes/x86-32: Move irq-exit functions to kprobes section Move irq-exit functions to .kprobes.text section to protect against kprobes recursion. When I ran kprobe stress test on x86-32, I found below symbols cause unrecoverable recursive probing: ret_from_exception ret_from_intr check_userspace restore_all restore_all_notrace restore_nocheck irq_return And also, I found some interrupt/exception entry points that cause similar problems. This patch moves those symbols (including their container functions) to .kprobes.text section to prevent any kprobes probing. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Ingo Molnar LKML-Reference: <20090908164755.24050.81182.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/entry_32.S | 24 ++++++++++++++++++++++++ kernel/kprobes.c | 2 ++ 2 files changed, 26 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d607c6..beb30da203d6 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -333,6 +333,10 @@ ENTRY(ret_from_fork) CFI_ENDPROC END(ret_from_fork) +/* + * Interrupt exit functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to @@ -383,6 +387,10 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ @@ -513,6 +521,10 @@ sysexit_audit: PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) +/* + * syscall stub including irq exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -705,6 +717,10 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* * System calls that need a pt_regs pointer. @@ -814,6 +830,10 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC +/* + * Irq entries should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) +/* + * End of kprobes section + */ + .popsection ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3267d90bc9d6..00d01b0f9fee 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -91,6 +91,8 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) static struct kprobe_blackpoint kprobe_blacklist[] = { {"preempt_schedule",}, {"native_get_debugreg",}, + {"irq_entries_start",}, + {"common_interrupt",}, {NULL} /* Terminator */ }; -- cgit v1.2.3-59-g8ed1b From ad5cafcdb09c57008c990edd309c0a563b09f238 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2009 19:53:06 -0400 Subject: x86/ptrace: Fix regs_get_argument_nth() to add correct offset Fix regs_get_argument_nth() to add correct offset bytes. Because offset_of() returns offset in byte, the offset should be added to char * instead of unsigned long *. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <20090910235306.22412.31613.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a33a17d5d5c8..caffb6809452 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -150,7 +150,7 @@ static const int arg_offs_table[] = { unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) { if (n < ARRAY_SIZE(arg_offs_table)) - return *((unsigned long *)regs + arg_offs_table[n]); + return *(unsigned long *)((char *)regs + arg_offs_table[n]); else { /* * The typical case: arg n is on the stack. -- cgit v1.2.3-59-g8ed1b From 04a705df47d1ea27ca2b066f24b1951c51792d0d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 6 Oct 2009 16:42:08 +0200 Subject: perf_events: Check for filters on fixed counter events Intel fixed counters do not support all the filters possible with a generic counter. Thus, if a fixed counter event is passed but with certain filters set, then the fixed_mode_idx() function must fail and the event must be measured in a generic counter instead. Reject filters are: inv, edge, cnt-mask. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1254840129-6198-2-git-send-email-eranian@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_event.h | 13 ++++++++++++- arch/x86/kernel/cpu/perf_event.c | 6 ++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index ad7ce3fd5065..8d9f8548a870 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -28,9 +28,20 @@ */ #define ARCH_PERFMON_EVENT_MASK 0xffff +/* + * filter mask to validate fixed counter events. + * the following filters disqualify for fixed counters: + * - inv + * - edge + * - cnt-mask + * The other filters are supported by fixed counters. + * The any-thread option is supported starting with v3. + */ +#define ARCH_PERFMON_EVENT_FILTER_MASK 0xff840000 + #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b5801c311846..1d16bd69551e 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1349,6 +1349,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) if (!x86_pmu.num_events_fixed) return -1; + /* + * fixed counters do not take all possible filters + */ + if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) + return -1; + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) -- cgit v1.2.3-59-g8ed1b From b690081d4d3f6a23541493f1682835c3cd5c54a1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 6 Oct 2009 16:42:09 +0200 Subject: perf_events: Add event constraints support for Intel processors On some Intel processors, not all events can be measured in all counters. Some events can only be measured in one particular counter, for instance. Assigning an event to the wrong counter does not crash the machine but this yields bogus counts, i.e., silent error. This patch changes the event to counter assignment logic to take into account event constraints for Intel P6, Core and Nehalem processors. There is no contraints on Intel Atom. There are constraints on Intel Yonah (Core Duo) but they are not provided in this patch given that this processor is not yet supported by perf_events. As a result of the constraints, it is possible for some event groups to never actually be loaded onto the PMU if they contain two events which can only be measured on a single counter. That situation can be detected with the scaling information extracted with read(). Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1254840129-6198-3-git-send-email-eranian@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 109 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1d16bd69551e..9c758548a0e6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -77,6 +77,18 @@ struct cpu_hw_events { struct debug_store *ds; }; +struct event_constraint { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int code; +}; + +#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } +#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->idxmsk[0]; (e)++) + + /* * struct x86_pmu - generic x86 pmu */ @@ -102,6 +114,7 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); + int (*get_event_idx)(struct hw_perf_event *hwc); }; static struct x86_pmu x86_pmu __read_mostly; @@ -110,6 +123,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; +static const struct event_constraint *event_constraint; + /* * Not sure about some of these */ @@ -155,6 +170,16 @@ static u64 p6_pmu_raw_event(u64 hw_event) return hw_event & P6_EVNTSEL_MASK; } +static const struct event_constraint intel_p6_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; /* * Intel PerfMon v3. Used on Core2 and later. @@ -170,6 +195,35 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; +static const struct event_constraint intel_core_event_constraints[] = +{ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static const struct event_constraint intel_nehalem_event_constraints[] = +{ + EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ + EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -932,6 +986,8 @@ static int __hw_perf_event_init(struct perf_event *event) */ hwc->config = ARCH_PERFMON_EVENTSEL_INT; + hwc->idx = -1; + /* * Count user and OS events unless requested not to. */ @@ -1365,6 +1421,45 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) return -1; } +/* + * generic counter allocator: get next free counter + */ +static int gen_get_event_idx(struct hw_perf_event *hwc) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); + return idx == x86_pmu.num_events ? -1 : idx; +} + +/* + * intel-specific counter allocator: check event constraints + */ +static int intel_get_event_idx(struct hw_perf_event *hwc) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + const struct event_constraint *event_constraint; + int i, code; + + if (!event_constraint) + goto skip; + + code = hwc->config & 0xff; + + for_each_event_constraint(event_constraint, event_constraint) { + if (code == event_constraint->code) { + for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { + if (!test_and_set_bit(i, cpuc->used_mask)) + return i; + } + return -1; + } + } +skip: + return gen_get_event_idx(hwc); +} + /* * Find a PMC slot for the freshly enabled / scheduled in event: */ @@ -1402,11 +1497,10 @@ static int x86_pmu_enable(struct perf_event *event) } else { idx = hwc->idx; /* Try to get the previous generic event again */ - if (test_and_set_bit(idx, cpuc->used_mask)) { + if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_events); - if (idx == x86_pmu.num_events) + idx = x86_pmu.get_event_idx(hwc); + if (idx == -1) return -EAGAIN; set_bit(idx, cpuc->used_mask); @@ -1883,6 +1977,7 @@ static struct x86_pmu p6_pmu = { */ .event_bits = 32, .event_mask = (1ULL << 32) - 1, + .get_event_idx = intel_get_event_idx, }; static struct x86_pmu intel_pmu = { @@ -1906,6 +2001,7 @@ static struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, + .get_event_idx = intel_get_event_idx, }; static struct x86_pmu amd_pmu = { @@ -1926,6 +2022,7 @@ static struct x86_pmu amd_pmu = { .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, + .get_event_idx = gen_get_event_idx, }; static int p6_pmu_init(void) @@ -1938,10 +2035,12 @@ static int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ + event_constraint = intel_p6_event_constraints; break; case 9: case 13: /* Pentium M */ + event_constraint = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU model %d ", @@ -2013,12 +2112,14 @@ static int intel_pmu_init(void) sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); + event_constraint = intel_core_event_constraints; break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + event_constraint = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: -- cgit v1.2.3-59-g8ed1b From fe9081cc9bdabb0be953a39ad977cea14e35bce5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Oct 2009 11:56:07 +0200 Subject: perf, x86: Add simple group validation Refuse to add events when the group wouldn't fit onto the PMU anymore. Naive implementation. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <1254911461.26976.239.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 90 ++++++++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9c758548a0e6..9961d845719d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -114,7 +114,8 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - int (*get_event_idx)(struct hw_perf_event *hwc); + int (*get_event_idx)(struct cpu_hw_events *cpuc, + struct hw_perf_event *hwc); }; static struct x86_pmu x86_pmu __read_mostly; @@ -523,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event) #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL #define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ @@ -1390,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) x86_pmu_enable_event(hwc, idx); } -static int -fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) +static int fixed_mode_idx(struct hw_perf_event *hwc) { unsigned int hw_event; @@ -1424,9 +1424,9 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) /* * generic counter allocator: get next free counter */ -static int gen_get_event_idx(struct hw_perf_event *hwc) +static int +gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); @@ -1436,16 +1436,16 @@ static int gen_get_event_idx(struct hw_perf_event *hwc) /* * intel-specific counter allocator: check event constraints */ -static int intel_get_event_idx(struct hw_perf_event *hwc) +static int +intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); const struct event_constraint *event_constraint; int i, code; if (!event_constraint) goto skip; - code = hwc->config & 0xff; + code = hwc->config & CORE_EVNTSEL_EVENT_MASK; for_each_event_constraint(event_constraint, event_constraint) { if (code == event_constraint->code) { @@ -1457,26 +1457,22 @@ static int intel_get_event_idx(struct hw_perf_event *hwc) } } skip: - return gen_get_event_idx(hwc); + return gen_get_event_idx(cpuc, hwc); } -/* - * Find a PMC slot for the freshly enabled / scheduled in event: - */ -static int x86_pmu_enable(struct perf_event *event) +static int +x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; int idx; - idx = fixed_mode_idx(event, hwc); + idx = fixed_mode_idx(hwc); if (idx == X86_PMC_IDX_FIXED_BTS) { /* BTS is already occupied. */ if (test_and_set_bit(idx, cpuc->used_mask)) return -EAGAIN; hwc->config_base = 0; - hwc->event_base = 0; + hwc->event_base = 0; hwc->idx = idx; } else if (idx >= 0) { /* @@ -1499,17 +1495,33 @@ static int x86_pmu_enable(struct perf_event *event) /* Try to get the previous generic event again */ if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = x86_pmu.get_event_idx(hwc); + idx = x86_pmu.get_event_idx(cpuc, hwc); if (idx == -1) return -EAGAIN; set_bit(idx, cpuc->used_mask); hwc->idx = idx; } - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; } + return idx; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in event: + */ +static int x86_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx; + + idx = x86_schedule_event(cpuc, hwc); + if (idx < 0) + return idx; + perf_events_lapic_init(); x86_pmu.disable(hwc, idx); @@ -2212,11 +2224,47 @@ static const struct pmu pmu = { .unthrottle = x86_pmu_unthrottle, }; +static int +validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event fake_event = event->hw; + + if (event->pmu != &pmu) + return 0; + + return x86_schedule_event(cpuc, &fake_event); +} + +static int validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct cpu_hw_events fake_pmu; + + memset(&fake_pmu, 0, sizeof(fake_pmu)); + + if (!validate_event(&fake_pmu, leader)) + return -ENOSPC; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + if (!validate_event(&fake_pmu, sibling)) + return -ENOSPC; + } + + if (!validate_event(&fake_pmu, event)) + return -ENOSPC; + + return 0; +} + const struct pmu *hw_perf_event_init(struct perf_event *event) { int err; err = __hw_perf_event_init(event); + if (!err) { + if (event->group_leader != event) + err = validate_group(event); + } if (err) { if (event->destroy) event->destroy(event); -- cgit v1.2.3-59-g8ed1b From fb2531953fd8855abdcf458459020fd382c5deca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 7 Oct 2009 13:20:38 +0200 Subject: mce, edac: Use an atomic notifier for MCEs decoding Add an atomic notifier which ensures proper locking when conveying MCE info to EDAC for decoding. The actual notifier call overrides a default, negative priority notifier. Note: make sure we register the default decoder only once since mcheck_init() runs on each CPU. Signed-off-by: Borislav Petkov LKML-Reference: <20091003065752.GA8935@liondog.tnic> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 3 ++- arch/x86/kernel/cpu/mcheck/mce.c | 29 ++++++++++++++++++++--------- drivers/edac/edac_mce_amd.c | 21 ++++++++++++--------- 3 files changed, 34 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f1363b72364f..227a72df6441 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -108,6 +108,8 @@ struct mce_log { #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) +extern struct atomic_notifier_head x86_mce_decoder_chain; + #ifdef __KERNEL__ #include @@ -213,6 +215,5 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); void mce_log_therm_throt_event(__u64 status); - #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b1598a9436d0..15ba9c972d7a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -85,18 +85,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -static void default_decode_mce(struct mce *m) +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); + +static int default_decode_mce(struct notifier_block *nb, unsigned long val, + void *data) { pr_emerg("No human readable MCE decoding support on this CPU type.\n"); pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + + return NOTIFY_STOP; } -/* - * CPU/chipset specific EDAC code can register a callback here to print - * MCE errors in a human-readable form: - */ -void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce; -EXPORT_SYMBOL(x86_mce_decode_callback); +static struct notifier_block mce_dec_nb = { + .notifier_call = default_decode_mce, + .priority = -1, +}; /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { @@ -204,9 +212,9 @@ static void print_mce(struct mce *m) /* * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that): + * (if the CPU has an implementation for that) */ - x86_mce_decode_callback(m); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } static void print_mce_head(void) @@ -1420,6 +1428,9 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) mce_cpu_features(c); mce_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); + + if (raw_smp_processor_id() == 0) + atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); } /* diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 713ed7d37247..689cc6a6214d 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -3,7 +3,6 @@ static bool report_gart_errors; static void (*nb_bus_decoder)(int node_id, struct err_regs *regs); -static void (*orig_mce_callback)(struct mce *m); void amd_report_gart_errors(bool v) { @@ -363,8 +362,10 @@ static inline void amd_decode_err_code(unsigned int ec) pr_warning("Huh? Unknown MCE error 0x%x\n", ec); } -static void amd_decode_mce(struct mce *m) +static int amd_decode_mce(struct notifier_block *nb, unsigned long val, + void *data) { + struct mce *m = (struct mce *)data; struct err_regs regs; int node, ecc; @@ -420,20 +421,22 @@ static void amd_decode_mce(struct mce *m) } amd_decode_err_code(m->status & 0xffff); + + return NOTIFY_STOP; } +static struct notifier_block amd_mce_dec_nb = { + .notifier_call = amd_decode_mce, +}; + static int __init mce_amd_init(void) { /* * We can decode MCEs for Opteron and later CPUs: */ if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && - (boot_cpu_data.x86 >= 0xf)) { - /* safe the default decode mce callback */ - orig_mce_callback = x86_mce_decode_callback; - - x86_mce_decode_callback = amd_decode_mce; - } + (boot_cpu_data.x86 >= 0xf)) + atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); return 0; } @@ -442,7 +445,7 @@ early_initcall(mce_amd_init); #ifdef MODULE static void __exit mce_amd_exit(void) { - x86_mce_decode_callback = orig_mce_callback; + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); } MODULE_DESCRIPTION("AMD MCE decoder"); -- cgit v1.2.3-59-g8ed1b From 7a693d3f0d10f978ebdf3082c41404ab97106567 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 13 Oct 2009 08:16:30 +0200 Subject: perf_events, x86: Fix event constraints code There was namespace overlap due to a rename i did - this caused the following build warning, reported by Stephen Rothwell against linux-next x86_64 allmodconfig: arch/x86/kernel/cpu/perf_event.c: In function 'intel_get_event_idx': arch/x86/kernel/cpu/perf_event.c:1445: warning: 'event_constraint' is used uninitialized in this function This is a real bug not just a warning: fix it by renaming the global event-constraints table pointer to 'event_constraints'. Reported-by: Stephen Rothwell Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20091013144223.369d616d.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9961d845719d..2e20bca3cca1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -124,7 +124,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static const struct event_constraint *event_constraint; +static const struct event_constraint *event_constraints; /* * Not sure about some of these @@ -1442,12 +1442,12 @@ intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) const struct event_constraint *event_constraint; int i, code; - if (!event_constraint) + if (!event_constraints) goto skip; code = hwc->config & CORE_EVNTSEL_EVENT_MASK; - for_each_event_constraint(event_constraint, event_constraint) { + for_each_event_constraint(event_constraint, event_constraints) { if (code == event_constraint->code) { for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { if (!test_and_set_bit(i, cpuc->used_mask)) @@ -2047,12 +2047,12 @@ static int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ - event_constraint = intel_p6_event_constraints; + event_constraints = intel_p6_event_constraints; break; case 9: case 13: /* Pentium M */ - event_constraint = intel_p6_event_constraints; + event_constraints = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU model %d ", @@ -2124,14 +2124,14 @@ static int intel_pmu_init(void) sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); - event_constraint = intel_core_event_constraints; + event_constraints = intel_core_event_constraints; break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - event_constraint = intel_nehalem_event_constraints; + event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: -- cgit v1.2.3-59-g8ed1b From 8968f9d3dc23d9a1821d97c6f11e72a59382e56c Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 13 Oct 2009 16:19:41 +0900 Subject: perf_event, x86, mce: Use TRACE_EVENT() for MCE logging This approach is the first baby step towards solving many of the structural problems the x86 MCE logging code is having today: - It has a private ring-buffer implementation that has a number of limitations and has been historically fragile and buggy. - It is using a quirky /dev/mcelog ioctl driven ABI that is MCE specific. /dev/mcelog is not part of any larger logging framework and hence has remained on the fringes for many years. - The MCE logging code is still very unclean partly due to its ABI limitations. Fields are being reused for multiple purposes, and the whole message structure is limited and x86 specific to begin with. All in one, the x86 tree would like to move away from this private implementation of an event logging facility to a broader framework. By using perf events we gain the following advantages: - Multiple user-space agents can access MCE events. We can have an mcelog daemon running but also a system-wide tracer capturing important events in flight-recorder mode. - Sampling support: the kernel and the user-space call-chain of MCE events can be stored and analyzed as well. This way actual patterns of bad behavior can be matched to precisely what kind of activity happened in the kernel (and/or in the app) around that moment in time. - Coupling with other hardware and software events: the PMU can track a number of other anomalies - monitoring software might chose to monitor those plus the MCE events as well - in one coherent stream of events. - Discovery of MCE sources - tracepoints are enumerated and tools can act upon the existence (or non-existence) of various channels of MCE information. - Filtering support: we just subscribe to and act upon the events we are interested in. Then even on a per event source basis there's in-kernel filter expressions available that can restrict the amount of data that hits the event channel. - Arbitrary deep per cpu buffering of events - we can buffer 32 entries or we can buffer as much as we want, as long as we have the RAM. - An NMI-safe ring-buffer implementation - mappable to user-space. - Built-in support for timestamping of events, PID markers, CPU markers, etc. - A rich ABI accessible over system call interface. Per cpu, per task and per workload monitoring of MCE events can be done this way. The ABI itself has a nice, meaningful structure. - Extensible ABI: new fields can be added without breaking tooling. New tracepoints can be added as the hardware side evolves. There's various parsers that can be used. - Lots of scheduling/buffering/batching modes of operandi for MCE events. poll() support. mmap() support. read() support. You name it. - Rich tooling support: even without any MCE specific extensions added the 'perf' tool today offers various views of MCE data: perf report, perf stat, perf trace can all be used to view logged MCE events and perhaps correlate them to certain user-space usage patterns. But it can be used directly as well, for user-space agents and policy action in mcelog, etc. With this we hope to achieve significant code cleanup and feature improvements in the MCE code, and we hope to be able to drop the /dev/mcelog facility in the end. This patch is just a plain dumb dump of mce_log() records to the tracepoints / perf events framework - a first proof of concept step. Signed-off-by: Hidetoshi Seto Cc: Huang Ying Cc: Andi Kleen LKML-Reference: <4AD42A0D.7050104@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++ include/trace/events/mce.h | 69 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 include/trace/events/mce.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b1598a9436d0..39caea3d8bc3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -46,6 +46,9 @@ #include "mce-internal.h" +#define CREATE_TRACE_POINTS +#include + int mce_disabled __read_mostly; #define MISC_MCELOG_MINOR 227 @@ -141,6 +144,9 @@ void mce_log(struct mce *mce) { unsigned next, entry; + /* Emit the trace record: */ + trace_mce_record(mce); + mce->finished = 0; wmb(); for (;;) { diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h new file mode 100644 index 000000000000..7eee77895cb3 --- /dev/null +++ b/include/trace/events/mce.h @@ -0,0 +1,69 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mce + +#if !defined(_TRACE_MCE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MCE_H + +#include +#include +#include + +TRACE_EVENT(mce_record, + + TP_PROTO(struct mce *m), + + TP_ARGS(m), + + TP_STRUCT__entry( + __field( u64, mcgcap ) + __field( u64, mcgstatus ) + __field( u8, bank ) + __field( u64, status ) + __field( u64, addr ) + __field( u64, misc ) + __field( u64, ip ) + __field( u8, cs ) + __field( u64, tsc ) + __field( u64, walltime ) + __field( u32, cpu ) + __field( u32, cpuid ) + __field( u32, apicid ) + __field( u32, socketid ) + __field( u8, cpuvendor ) + ), + + TP_fast_assign( + __entry->mcgcap = m->mcgcap; + __entry->mcgstatus = m->mcgstatus; + __entry->bank = m->bank; + __entry->status = m->status; + __entry->addr = m->addr; + __entry->misc = m->misc; + __entry->ip = m->ip; + __entry->cs = m->cs; + __entry->tsc = m->tsc; + __entry->walltime = m->time; + __entry->cpu = m->extcpu; + __entry->cpuid = m->cpuid; + __entry->apicid = m->apicid; + __entry->socketid = m->socketid; + __entry->cpuvendor = m->cpuvendor; + ), + + TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, ADDR/MISC: %016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x", + __entry->cpu, + __entry->mcgcap, __entry->mcgstatus, + __entry->bank, __entry->status, + __entry->addr, __entry->misc, + __entry->cs, __entry->ip, + __entry->tsc, + __entry->cpuvendor, __entry->cpuid, + __entry->walltime, + __entry->socketid, + __entry->apicid) +); + +#endif /* _TRACE_MCE_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3-59-g8ed1b From 5e09954a9acc3b435ffe318b95afd3c02fae069f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 16 Oct 2009 12:31:32 +0200 Subject: x86, mce: Fix up MCE naming nomenclature Prefix global/setup routines with "mcheck_" thus differentiating from the internal facilities prefixed with "mce_". Also, prefix the per cpu calls with mcheck_cpu and rename them to reflect the MCE setup hierarchy of calls better. There should be no functionality change resulting from this patch. Signed-off-by: Borislav Petkov Cc: Andi Kleen LKML-Reference: <1255689093-26921-1-git-send-email-borislav.petkov@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 4 ++-- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 52 ++++++++++++++++++++-------------------- 3 files changed, 29 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 227a72df6441..161485da6838 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -120,9 +120,9 @@ extern int mce_disabled; extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE -void mcheck_init(struct cpuinfo_x86 *c); +void mcheck_cpu_init(struct cpuinfo_x86 *c); #else -static inline void mcheck_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} #endif #ifdef CONFIG_X86_ANCIENT_MCE diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc25c2b4a567..4df69a38be57 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -839,7 +839,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ - mcheck_init(c); + mcheck_cpu_init(c); #endif select_idle_routine(c); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0fb9dc50697e..68d968e69b13 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1136,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_timer(unsigned long data) +static void mce_start_timer(unsigned long data) { struct timer_list *t = &per_cpu(mce_timer, data); int *n; @@ -1220,7 +1220,7 @@ static int mce_banks_init(void) /* * Initialize Machine Checks for a CPU. */ -static int __cpuinit mce_cap_init(void) +static int __cpuinit __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1258,7 +1258,7 @@ static int __cpuinit mce_cap_init(void) return 0; } -static void mce_init(void) +static void __mcheck_cpu_init_generic(void) { mce_banks_t all_banks; u64 cap; @@ -1287,7 +1287,7 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); @@ -1355,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) return 0; } -static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) return; @@ -1369,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) } } -static void mce_cpu_features(struct cpuinfo_x86 *c) +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: @@ -1383,7 +1383,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) } } -static void mce_init_timer(void) +static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); int *n = &__get_cpu_var(mce_next_interval); @@ -1394,7 +1394,7 @@ static void mce_init_timer(void) *n = check_interval * HZ; if (!*n) return; - setup_timer(t, mcheck_timer, smp_processor_id()); + setup_timer(t, mce_start_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); add_timer_on(t, smp_processor_id()); } @@ -1414,26 +1414,26 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = * Called for each booted CPU to set up machine checks. * Must be called with preempt off: */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) { if (mce_disabled) return; - mce_ancient_init(c); + __mcheck_cpu_ancient_init(c); if (!mce_available(c)) return; - if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { + if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { mce_disabled = 1; return; } machine_check_vector = do_machine_check; - mce_init(); - mce_cpu_features(c); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); if (raw_smp_processor_id() == 0) @@ -1665,7 +1665,7 @@ __setup("mce", mcheck_enable); * Disable machine checks on suspend and shutdown. We can't really handle * them later. */ -static int mce_disable(void) +static int mce_disable_error_reporting(void) { int i; @@ -1680,12 +1680,12 @@ static int mce_disable(void) static int mce_suspend(struct sys_device *dev, pm_message_t state) { - return mce_disable(); + return mce_disable_error_reporting(); } static int mce_shutdown(struct sys_device *dev) { - return mce_disable(); + return mce_disable_error_reporting(); } /* @@ -1695,8 +1695,8 @@ static int mce_shutdown(struct sys_device *dev) */ static int mce_resume(struct sys_device *dev) { - mce_init(); - mce_cpu_features(¤t_cpu_data); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(¤t_cpu_data); return 0; } @@ -1706,8 +1706,8 @@ static void mce_cpu_restart(void *data) del_timer_sync(&__get_cpu_var(mce_timer)); if (!mce_available(¤t_cpu_data)) return; - mce_init(); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_timer(); } /* Reinit MCEs after user configuration changes */ @@ -1733,7 +1733,7 @@ static void mce_enable_ce(void *all) cmci_reenable(); cmci_recheck(); if (all) - mce_init_timer(); + __mcheck_cpu_init_timer(); } static struct sysdev_class mce_sysclass = { @@ -2042,7 +2042,7 @@ static __init void mce_init_banks(void) } } -static __init int mce_init_device(void) +static __init int mcheck_init_device(void) { int err; int i = 0; @@ -2070,7 +2070,7 @@ static __init int mce_init_device(void) return err; } -device_initcall(mce_init_device); +device_initcall(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. @@ -2118,7 +2118,7 @@ static int fake_panic_set(void *data, u64 val) DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, "%llu\n"); -static int __init mce_debugfs_init(void) +static int __init mcheck_debugfs_init(void) { struct dentry *dmce, *ffake_panic; @@ -2132,5 +2132,5 @@ static int __init mce_debugfs_init(void) return 0; } -late_initcall(mce_debugfs_init); +late_initcall(mcheck_debugfs_init); #endif -- cgit v1.2.3-59-g8ed1b From b33a6363649f0ff83ec81597ea7fe7e688f973cb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 16 Oct 2009 12:31:33 +0200 Subject: x86, mce: Add a global MCE init helper Add an early initcall (pre SMP) which sets up global MCE functionality. Signed-off-by: Borislav Petkov Cc: Andi Kleen LKML-Reference: <1255689093-26921-2-git-send-email-borislav.petkov@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 68d968e69b13..80801705edd7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1436,8 +1436,6 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); - if (raw_smp_processor_id() == 0) - atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); } /* @@ -1657,6 +1655,14 @@ static int __init mcheck_enable(char *str) } __setup("mce", mcheck_enable); +static int __init mcheck_init(void) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); + + return 0; +} +early_initcall(mcheck_init); + /* * Sysfs support */ -- cgit v1.2.3-59-g8ed1b From 41a48d14f6991020c9bb6b93e289ca5b411ed09a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 5 Oct 2009 19:23:06 +0900 Subject: x86/hw-breakpoints: Actually flush thread breakpoints in flush_thread(). flush_thread() tries to do a TIF_DEBUG check before calling in to flush_thread_hw_breakpoint() (which subsequently clears the thread flag), but for some reason, the x86 code is manually clearing TIF_DEBUG immediately before the test, so this path will never be taken. This kills off the erroneous clear_tsk_thread_flag() and lets flush_thread_hw_breakpoint() actually get invoked. Presumably folks were getting lucky with testing and the free_thread_info() -> free_thread_xstate() path was taking care of the flush there. Signed-off-by: Paul Mundt Acked-by: "K.Prasad" Cc: Ingo Molnar Cc: Alan Stern LKML-Reference: <20091005102306.GA7889@linux-sh.org> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/process.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2275ce5776de..cf8ee0016307 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -107,8 +107,6 @@ void flush_thread(void) } #endif - clear_tsk_thread_flag(tsk, TIF_DEBUG); - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) flush_thread_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); -- cgit v1.2.3-59-g8ed1b From 24f1e32c60c45c89a997c73395b69c8af6f0a84e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 9 Sep 2009 19:22:48 +0200 Subject: hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events This patch rebase the implementation of the breakpoints API on top of perf events instances. Each breakpoints are now perf events that handle the register scheduling, thread/cpu attachment, etc.. The new layering is now made as follows: ptrace kgdb ftrace perf syscall \ | / / \ | / / / Core breakpoint API / / | / | / Breakpoints perf events | | Breakpoints PMU ---- Debug Register constraints handling (Part of core breakpoint API) | | Hardware debug registers Reasons of this rewrite: - Use the centralized/optimized pmu registers scheduling, implying an easier arch integration - More powerful register handling: perf attributes (pinned/flexible events, exclusive/non-exclusive, tunable period, etc...) Impact: - New perf ABI: the hardware breakpoints counters - Ptrace breakpoints setting remains tricky and still needs some per thread breakpoints references. Todo (in the order): - Support breakpoints perf counter events for perf tools (ie: implement perf_bpcounter_event()) - Support from perf tools Changes in v2: - Follow the perf "event " rename - The ptrace regression have been fixed (ptrace breakpoint perf events weren't released when a task ended) - Drop the struct hw_breakpoint and store generic fields in perf_event_attr. - Separate core and arch specific headers, drop asm-generic/hw_breakpoint.h and create linux/hw_breakpoint.h - Use new generic len/type for breakpoint - Handle off case: when breakpoints api is not supported by an arch Changes in v3: - Fix broken CONFIG_KVM, we need to propagate the breakpoint api changes to kvm when we exit the guest and restore the bp registers to the host. Changes in v4: - Drop the hw_breakpoint_restore() stub as it is only used by KVM - EXPORT_SYMBOL_GPL hw_breakpoint_restore() as KVM can be built as a module - Restore the breakpoints unconditionally on kvm guest exit: TIF_DEBUG_THREAD doesn't anymore cover every cases of running breakpoints and vcpu->arch.switch_db_regs might not always be set when the guest used debug registers. (Waiting for a reliable optimization) Changes in v5: - Split-up the asm-generic/hw-breakpoint.h moving to linux/hw_breakpoint.h into a separate patch - Optimize the breakpoints restoring while switching from kvm guest to host. We only want to restore the state if we have active breakpoints to the host, otherwise we don't care about messed-up address registers. - Add asm/hw_breakpoint.h to Kbuild - Fix bad breakpoint type in trace_selftest.c Changes in v6: - Fix wrong header inclusion in trace.h (triggered a build error with CONFIG_FTRACE_SELFTEST Signed-off-by: Frederic Weisbecker Cc: Prasad Cc: Alan Stern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Ingo Molnar Cc: Jan Kiszka Cc: Jiri Slaby Cc: Li Zefan Cc: Avi Kivity Cc: Paul Mackerras Cc: Mike Galbraith Cc: Masami Hiramatsu Cc: Paul Mundt --- arch/Kconfig | 3 + arch/x86/include/asm/Kbuild | 1 + arch/x86/include/asm/debugreg.h | 11 +- arch/x86/include/asm/hw_breakpoint.h | 58 +++-- arch/x86/include/asm/processor.h | 12 +- arch/x86/kernel/hw_breakpoint.c | 391 +++++++++++++++++++++----------- arch/x86/kernel/process.c | 7 +- arch/x86/kernel/process_32.c | 26 +-- arch/x86/kernel/process_64.c | 26 +-- arch/x86/kernel/ptrace.c | 182 ++++++++++----- arch/x86/kernel/smpboot.c | 3 - arch/x86/kvm/x86.c | 18 +- arch/x86/power/cpu.c | 6 - include/linux/hw_breakpoint.h | 243 ++++++++++---------- include/linux/perf_event.h | 26 ++- kernel/exit.c | 5 + kernel/hw_breakpoint.c | 424 ++++++++++++++--------------------- kernel/perf_event.c | 53 ++++- kernel/trace/trace.h | 5 +- kernel/trace/trace_entries.h | 6 +- kernel/trace/trace_ksym.c | 126 +++++------ kernel/trace/trace_selftest.c | 3 +- 22 files changed, 885 insertions(+), 750 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/Kconfig b/arch/Kconfig index acb664397945..eef3bbb97075 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -128,6 +128,9 @@ config HAVE_DEFAULT_NO_SPIN_MUTEXES config HAVE_HW_BREAKPOINT bool + depends on HAVE_PERF_EVENTS + select ANON_INODES + select PERF_EVENTS source "kernel/gcov/Kconfig" diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 4a8e80cdcfa5..9f828f87ca35 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -10,6 +10,7 @@ header-y += ptrace-abi.h header-y += sigcontext32.h header-y += ucontext.h header-y += processor-flags.h +header-y += hw_breakpoint.h unifdef-y += e820.h unifdef-y += ist.h diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 23439fbb1d0e..9a3333c91f9a 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -75,13 +75,8 @@ */ #ifdef __KERNEL__ -/* For process management */ -extern void flush_thread_hw_breakpoint(struct task_struct *tsk); -extern int copy_thread_hw_breakpoint(struct task_struct *tsk, - struct task_struct *child, unsigned long clone_flags); +DECLARE_PER_CPU(unsigned long, dr7); -/* For CPU management */ -extern void load_debug_registers(void); static inline void hw_breakpoint_disable(void) { /* Zero the control register for HW Breakpoint */ @@ -94,6 +89,10 @@ static inline void hw_breakpoint_disable(void) set_debugreg(0UL, 3); } +#ifdef CONFIG_KVM +extern void hw_breakpoint_restore(void); +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 3cfca8e2b5f6..0675a7c4c20e 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -4,6 +4,11 @@ #ifdef __KERNEL__ #define __ARCH_HW_BREAKPOINT_H +/* + * The name should probably be something dealt in + * a higher level. While dealing with the user + * (display/resolving) + */ struct arch_hw_breakpoint { char *name; /* Contains name of the symbol to set bkpt */ unsigned long address; @@ -12,44 +17,57 @@ struct arch_hw_breakpoint { }; #include -#include +#include +#include /* Available HW breakpoint length encodings */ -#define HW_BREAKPOINT_LEN_1 0x40 -#define HW_BREAKPOINT_LEN_2 0x44 -#define HW_BREAKPOINT_LEN_4 0x4c -#define HW_BREAKPOINT_LEN_EXECUTE 0x40 +#define X86_BREAKPOINT_LEN_1 0x40 +#define X86_BREAKPOINT_LEN_2 0x44 +#define X86_BREAKPOINT_LEN_4 0x4c +#define X86_BREAKPOINT_LEN_EXECUTE 0x40 #ifdef CONFIG_X86_64 -#define HW_BREAKPOINT_LEN_8 0x48 +#define X86_BREAKPOINT_LEN_8 0x48 #endif /* Available HW breakpoint type encodings */ /* trigger on instruction execute */ -#define HW_BREAKPOINT_EXECUTE 0x80 +#define X86_BREAKPOINT_EXECUTE 0x80 /* trigger on memory write */ -#define HW_BREAKPOINT_WRITE 0x81 +#define X86_BREAKPOINT_WRITE 0x81 /* trigger on memory read or write */ -#define HW_BREAKPOINT_RW 0x83 +#define X86_BREAKPOINT_RW 0x83 /* Total number of available HW breakpoint registers */ #define HBP_NUM 4 -extern struct hw_breakpoint *hbp_kernel[HBP_NUM]; -DECLARE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]); -extern unsigned int hbp_user_refcount[HBP_NUM]; +struct perf_event; +struct pmu; -extern void arch_install_thread_hw_breakpoint(struct task_struct *tsk); -extern void arch_uninstall_thread_hw_breakpoint(void); extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len); -extern int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, - struct task_struct *tsk); -extern void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk); -extern void arch_flush_thread_hw_breakpoint(struct task_struct *tsk); -extern void arch_update_kernel_hw_breakpoint(void *); +extern int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, - unsigned long val, void *data); + unsigned long val, void *data); + + +int arch_install_hw_breakpoint(struct perf_event *bp); +void arch_uninstall_hw_breakpoint(struct perf_event *bp); +void hw_breakpoint_pmu_read(struct perf_event *bp); +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp); + +extern void +arch_fill_perf_breakpoint(struct perf_event *bp); + +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type); +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type); + +extern int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type); + +extern struct pmu perf_ops_bp; + #endif /* __KERNEL__ */ #endif /* _I386_HW_BREAKPOINT_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 61aafb71c7ef..820f3000f736 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -423,6 +423,8 @@ extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; +struct perf_event; + struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -444,12 +446,10 @@ struct thread_struct { unsigned long fs; #endif unsigned long gs; - /* Hardware debugging registers: */ - unsigned long debugreg[HBP_NUM]; - unsigned long debugreg6; - unsigned long debugreg7; - /* Hardware breakpoint info */ - struct hw_breakpoint *hbp[HBP_NUM]; + /* Save middle states of ptrace breakpoints */ + struct perf_event *ptrace_bps[HBP_NUM]; + /* Debug status used for traps, single steps, etc... */ + unsigned long debugreg6; /* Fault info: */ unsigned long cr2; unsigned long trap_no; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 9316a9de4de3..e622620790bd 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -15,6 +15,7 @@ * * Copyright (C) 2007 Alan Stern * Copyright (C) 2009 IBM Corporation + * Copyright (C) 2009 Frederic Weisbecker */ /* @@ -22,6 +23,8 @@ * using the CPU's debug registers. */ +#include +#include #include #include #include @@ -38,26 +41,24 @@ #include #include -/* Unmasked kernel DR7 value */ -static unsigned long kdr7; +/* Per cpu debug control register value */ +DEFINE_PER_CPU(unsigned long, dr7); + +/* Per cpu debug address registers values */ +static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); /* - * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register. - * Used to clear and verify the status of bits corresponding to DR0 - DR3 + * Stores the breakpoints currently in use on each breakpoint address + * register for each cpus */ -static const unsigned long dr7_masks[HBP_NUM] = { - 0x000f0003, /* LEN0, R/W0, G0, L0 */ - 0x00f0000c, /* LEN1, R/W1, G1, L1 */ - 0x0f000030, /* LEN2, R/W2, G2, L2 */ - 0xf00000c0 /* LEN3, R/W3, G3, L3 */ -}; +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); /* * Encode the length, type, Exact, and Enable bits for a particular breakpoint * as stored in debug register 7. */ -static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) { unsigned long bp_info; @@ -68,64 +69,89 @@ static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) return bp_info; } -void arch_update_kernel_hw_breakpoint(void *unused) +/* + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. + */ +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) { - struct hw_breakpoint *bp; - int i, cpu = get_cpu(); - unsigned long temp_kdr7 = 0; - - /* Don't allow debug exceptions while we update the registers */ - set_debugreg(0UL, 7); + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); - for (i = hbp_kernel_pos; i < HBP_NUM; i++) { - per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i]; - if (bp) { - temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type); - set_debugreg(bp->info.address, i); - } - } + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; - /* No need to set DR6. Update the debug registers with kernel-space - * breakpoint values from kdr7 and user-space requests from the - * current process - */ - kdr7 = temp_kdr7; - set_debugreg(kdr7 | current->thread.debugreg7, 7); - put_cpu(); + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; } /* - * Install the thread breakpoints in their debug registers. + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. Eventually we enable it in the debug control register. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. */ -void arch_install_thread_hw_breakpoint(struct task_struct *tsk) +int arch_install_hw_breakpoint(struct perf_event *bp) { - struct thread_struct *thread = &(tsk->thread); - - switch (hbp_kernel_pos) { - case 4: - set_debugreg(thread->debugreg[3], 3); - case 3: - set_debugreg(thread->debugreg[2], 2); - case 2: - set_debugreg(thread->debugreg[1], 1); - case 1: - set_debugreg(thread->debugreg[0], 0); - default: - break; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (!*slot) { + *slot = bp; + break; + } } - /* No need to set DR6 */ - set_debugreg((kdr7 | thread->debugreg7), 7); + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return -EBUSY; + + set_debugreg(info->address, i); + __get_cpu_var(cpu_debugreg[i]) = info->address; + + dr7 = &__get_cpu_var(dr7); + *dr7 |= encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); + + return 0; } /* - * Install the debug register values for just the kernel, no thread. + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. */ -void arch_uninstall_thread_hw_breakpoint(void) +void arch_uninstall_hw_breakpoint(struct perf_event *bp) { - /* Clear the user-space portion of debugreg7 by setting only kdr7 */ - set_debugreg(kdr7, 7); + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (*slot == bp) { + *slot = NULL; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return; + dr7 = &__get_cpu_var(dr7); + *dr7 &= ~encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); } static int get_hbp_len(u8 hbp_len) @@ -133,17 +159,17 @@ static int get_hbp_len(u8 hbp_len) unsigned int len_in_bytes = 0; switch (hbp_len) { - case HW_BREAKPOINT_LEN_1: + case X86_BREAKPOINT_LEN_1: len_in_bytes = 1; break; - case HW_BREAKPOINT_LEN_2: + case X86_BREAKPOINT_LEN_2: len_in_bytes = 2; break; - case HW_BREAKPOINT_LEN_4: + case X86_BREAKPOINT_LEN_4: len_in_bytes = 4; break; #ifdef CONFIG_X86_64 - case HW_BREAKPOINT_LEN_8: + case X86_BREAKPOINT_LEN_8: len_in_bytes = 8; break; #endif @@ -178,67 +204,146 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) /* * Store a breakpoint's encoded address, length, and type. */ -static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk) +static int arch_store_info(struct perf_event *bp) { - /* - * User-space requests will always have the address field populated - * Symbol names from user-space are rejected - */ - if (tsk && bp->info.name) - return -EINVAL; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); /* * For kernel-addresses, either the address or symbol name can be * specified. */ - if (bp->info.name) - bp->info.address = (unsigned long) - kallsyms_lookup_name(bp->info.name); - if (bp->info.address) + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); + if (info->address) return 0; + return -EINVAL; } -/* - * Validate the arch-specific HW Breakpoint register settings - */ -int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, - struct task_struct *tsk) +int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type) { - unsigned int align; - int ret = -EINVAL; + /* Len */ + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + *gen_len = HW_BREAKPOINT_LEN_1; + break; + case X86_BREAKPOINT_LEN_2: + *gen_len = HW_BREAKPOINT_LEN_2; + break; + case X86_BREAKPOINT_LEN_4: + *gen_len = HW_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + *gen_len = HW_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } - switch (bp->info.type) { - /* - * Ptrace-refactoring code - * For now, we'll allow instruction breakpoint only for user-space - * addresses - */ - case HW_BREAKPOINT_EXECUTE: - if ((!arch_check_va_in_userspace(bp->info.address, - bp->info.len)) && - bp->info.len != HW_BREAKPOINT_LEN_EXECUTE) - return ret; + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + *gen_type = HW_BREAKPOINT_X; break; - case HW_BREAKPOINT_WRITE: + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; break; - case HW_BREAKPOINT_RW: + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; break; default: - return ret; + return -EINVAL; } - switch (bp->info.len) { + return 0; +} + + +static int arch_build_bp_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + info->address = bp->attr.bp_addr; + + /* Len */ + switch (bp->attr.bp_len) { case HW_BREAKPOINT_LEN_1: - align = 0; + info->len = X86_BREAKPOINT_LEN_1; break; case HW_BREAKPOINT_LEN_2: - align = 1; + info->len = X86_BREAKPOINT_LEN_2; break; case HW_BREAKPOINT_LEN_4: - align = 3; + info->len = X86_BREAKPOINT_LEN_4; break; #ifdef CONFIG_X86_64 case HW_BREAKPOINT_LEN_8: + info->len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + break; + default: + return -EINVAL; + } + + return 0; +} +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned int align; + int ret; + + + ret = arch_build_bp_info(bp); + if (ret) + return ret; + + ret = -EINVAL; + + if (info->type == X86_BREAKPOINT_EXECUTE) + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + if ((!arch_check_va_in_userspace(info->address, info->len)) && + info->len != X86_BREAKPOINT_EXECUTE) + return ret; + + switch (info->len) { + case X86_BREAKPOINT_LEN_1: + align = 0; + break; + case X86_BREAKPOINT_LEN_2: + align = 1; + break; + case X86_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: align = 7; break; #endif @@ -246,8 +351,8 @@ int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, return ret; } - if (bp->triggered) - ret = arch_store_info(bp, tsk); + if (bp->callback) + ret = arch_store_info(bp); if (ret < 0) return ret; @@ -255,44 +360,47 @@ int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, * Check that the low-order bits of the address are appropriate * for the alignment implied by len. */ - if (bp->info.address & align) + if (info->address & align) return -EINVAL; /* Check that the virtual address is in the proper range */ if (tsk) { - if (!arch_check_va_in_userspace(bp->info.address, bp->info.len)) + if (!arch_check_va_in_userspace(info->address, info->len)) return -EFAULT; } else { - if (!arch_check_va_in_kernelspace(bp->info.address, - bp->info.len)) + if (!arch_check_va_in_kernelspace(info->address, info->len)) return -EFAULT; } + return 0; } -void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk) +/* + * Release the user breakpoints used by ptrace + */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { - struct thread_struct *thread = &(tsk->thread); - struct hw_breakpoint *bp = thread->hbp[pos]; - - thread->debugreg7 &= ~dr7_masks[pos]; - if (bp) { - thread->debugreg[pos] = bp->info.address; - thread->debugreg7 |= encode_dr7(pos, bp->info.len, - bp->info.type); - } else - thread->debugreg[pos] = 0; + int i; + struct thread_struct *t = &tsk->thread; + + for (i = 0; i < HBP_NUM; i++) { + unregister_hw_breakpoint(t->ptrace_bps[i]); + t->ptrace_bps[i] = NULL; + } } -void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) +#ifdef CONFIG_KVM +void hw_breakpoint_restore(void) { - int i; - struct thread_struct *thread = &(tsk->thread); - - thread->debugreg7 = 0; - for (i = 0; i < HBP_NUM; i++) - thread->debugreg[i] = 0; + set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); + set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); + set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); + set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(__get_cpu_var(dr7), 7); } +EXPORT_SYMBOL_GPL(hw_breakpoint_restore); +#endif /* * Handle debug exception notifications. @@ -313,7 +421,7 @@ void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) static int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; - struct hw_breakpoint *bp; + struct perf_event *bp; unsigned long dr7, dr6; unsigned long *dr6_p; @@ -325,10 +433,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; - /* Lazy debug register switching */ - if (!test_tsk_thread_flag(current, TIF_DEBUG)) - arch_uninstall_thread_hw_breakpoint(); - get_debugreg(dr7, 7); /* Disable breakpoints during exception handling */ set_debugreg(0UL, 7); @@ -344,17 +448,18 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) for (i = 0; i < HBP_NUM; ++i) { if (likely(!(dr6 & (DR_TRAP0 << i)))) continue; + /* - * Find the corresponding hw_breakpoint structure and - * invoke its triggered callback. + * The counter may be concurrently released but that can only + * occur from a call_rcu() path. We can then safely fetch + * the breakpoint, use its callback, touch its counter + * while we are in an rcu_read_lock() path. */ - if (i >= hbp_kernel_pos) - bp = per_cpu(this_hbp_kernel[i], cpu); - else { - bp = current->thread.hbp[i]; - if (bp) - rc = NOTIFY_DONE; - } + rcu_read_lock(); + + bp = per_cpu(bp_per_reg[i], cpu); + if (bp) + rc = NOTIFY_DONE; /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling @@ -362,19 +467,23 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) (*dr6_p) &= ~(DR_TRAP0 << i); /* * bp can be NULL due to lazy debug register switching - * or due to the delay between updates of hbp_kernel_pos - * and this_hbp_kernel. + * or due to concurrent perf counter removing. */ - if (!bp) - continue; + if (!bp) { + rcu_read_unlock(); + break; + } + + (bp->callback)(bp, args->regs); - (bp->triggered)(bp, args->regs); + rcu_read_unlock(); } if (dr6 & (~DR_TRAP_BITS)) rc = NOTIFY_DONE; set_debugreg(dr7, 7); put_cpu(); + return rc; } @@ -389,3 +498,13 @@ int __kprobes hw_breakpoint_exceptions_notify( return hw_breakpoint_handler(data); } + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} + +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index cf8ee0016307..744508e7cfdd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -18,7 +19,6 @@ #include #include #include -#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -47,8 +47,6 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) - flush_thread_hw_breakpoint(tsk); WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } @@ -107,8 +105,7 @@ void flush_thread(void) } #endif - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) - flush_thread_hw_breakpoint(tsk); + flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 209e74801763..d5bd3132ee70 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -59,7 +59,6 @@ #include #include #include -#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -264,9 +263,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_ptr = NULL; tsk = current; err = -ENOMEM; - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) - if (copy_thread_hw_breakpoint(tsk, p, clone_flags)) - goto out; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, @@ -287,13 +285,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); -out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } - if (err) - flush_thread_hw_breakpoint(p); clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); p->thread.ds_ctx = NULL; @@ -437,23 +432,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) lazy_load_gs(next->gs); percpu_write(current_task, next_p); - /* - * There's a problem with moving the arch_install_thread_hw_breakpoint() - * call before current is updated. Suppose a kernel breakpoint is - * triggered in between the two, the hw-breakpoint handler will see that - * the 'current' task does not have TIF_DEBUG flag set and will think it - * is leftover from an old task (lazy switching) and will erase it. Then - * until the next context switch, no user-breakpoints will be installed. - * - * The real problem is that it's impossible to update both current and - * physical debug registers at the same instant, so there will always be - * a window in which they disagree and a breakpoint might get triggered. - * Since we use lazy switching, we are forced to assume that a - * disagreement means that current is correct and the exception is due - * to lazy debug register switching. - */ - if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) - arch_install_thread_hw_breakpoint(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 72edac026a78..5bafdec34441 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,7 +53,6 @@ #include #include #include -#include asmlinkage extern void ret_from_fork(void); @@ -244,8 +243,6 @@ void release_thread(struct task_struct *dead_task) BUG(); } } - if (unlikely(dead_task->thread.debugreg7)) - flush_thread_hw_breakpoint(dead_task); } static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) @@ -309,9 +306,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, savesegment(ds, p->thread.ds); err = -ENOMEM; - if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG))) - if (copy_thread_hw_breakpoint(me, p, clone_flags)) - goto out; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); @@ -351,8 +346,6 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } - if (err) - flush_thread_hw_breakpoint(p); return err; } @@ -508,23 +501,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (preload_fpu) __math_state_restore(); - /* - * There's a problem with moving the arch_install_thread_hw_breakpoint() - * call before current is updated. Suppose a kernel breakpoint is - * triggered in between the two, the hw-breakpoint handler will see that - * the 'current' task does not have TIF_DEBUG flag set and will think it - * is leftover from an old task (lazy switching) and will erase it. Then - * until the next context switch, no user-breakpoints will be installed. - * - * The real problem is that it's impossible to update both current and - * physical debug registers at the same instant, so there will always be - * a window in which they disagree and a breakpoint might get triggered. - * Since we use lazy switching, we are forced to assume that a - * disagreement means that current is correct and the exception is due - * to lazy debug register switching. - */ - if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) - arch_install_thread_hw_breakpoint(next_p); return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 267cb85b479c..e79610d95971 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -441,54 +443,59 @@ static int genregs_set(struct task_struct *target, return ret; } -/* - * Decode the length and type bits for a particular breakpoint as - * stored in debug register 7. Return the "enabled" status. - */ -static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, - unsigned *type) -{ - int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); - - *len = (bp_info & 0xc) | 0x40; - *type = (bp_info & 0x3) | 0x80; - return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; -} - -static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs) +static void ptrace_triggered(struct perf_event *bp, void *data) { - struct thread_struct *thread = &(current->thread); int i; + struct thread_struct *thread = &(current->thread); /* * Store in the virtual DR6 register the fact that the breakpoint * was hit so the thread's debugger will see it. */ - for (i = 0; i < hbp_kernel_pos; i++) - /* - * We will check bp->info.address against the address stored in - * thread's hbp structure and not debugreg[i]. This is to ensure - * that the corresponding bit for 'i' in DR7 register is enabled - */ - if (bp->info.address == thread->hbp[i]->info.address) + for (i = 0; i < HBP_NUM; i++) { + if (thread->ptrace_bps[i] == bp) break; + } thread->debugreg6 |= (DR_TRAP0 << i); } +/* + * Walk through every ptrace breakpoints for this thread and + * build the dr7 value on top of their attributes. + * + */ +static unsigned long ptrace_get_dr7(struct perf_event *bp[]) +{ + int i; + int dr7 = 0; + struct arch_hw_breakpoint *info; + + for (i = 0; i < HBP_NUM; i++) { + if (bp[i] && !bp[i]->attr.disabled) { + info = counter_arch_bp(bp[i]); + dr7 |= encode_dr7(i, info->len, info->type); + } + } + + return dr7; +} + /* * Handle ptrace writes to debug register 7. */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { struct thread_struct *thread = &(tsk->thread); - unsigned long old_dr7 = thread->debugreg7; + unsigned long old_dr7; int i, orig_ret = 0, rc = 0; int enabled, second_pass = 0; unsigned len, type; - struct hw_breakpoint *bp; + int gen_len, gen_type; + struct perf_event *bp; data &= ~DR_CONTROL_RESERVED; + old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: /* * Loop through all the hardware breakpoints, making the @@ -496,11 +503,12 @@ restore: */ for (i = 0; i < HBP_NUM; i++) { enabled = decode_dr7(data, i, &len, &type); - bp = thread->hbp[i]; + bp = thread->ptrace_bps[i]; if (!enabled) { if (bp) { - /* Don't unregister the breakpoints right-away, + /* + * Don't unregister the breakpoints right-away, * unless all register_user_hw_breakpoint() * requests have succeeded. This prevents * any window of opportunity for debug @@ -508,27 +516,45 @@ restore: */ if (!second_pass) continue; - unregister_user_hw_breakpoint(tsk, bp); - kfree(bp); + thread->ptrace_bps[i] = NULL; + unregister_hw_breakpoint(bp); } continue; } + + /* + * We shoud have at least an inactive breakpoint at this + * slot. It means the user is writing dr7 without having + * written the address register first + */ if (!bp) { - rc = -ENOMEM; - bp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL); - if (bp) { - bp->info.address = thread->debugreg[i]; - bp->triggered = ptrace_triggered; - bp->info.len = len; - bp->info.type = type; - rc = register_user_hw_breakpoint(tsk, bp); - if (rc) - kfree(bp); - } - } else - rc = modify_user_hw_breakpoint(tsk, bp); + rc = -EINVAL; + break; + } + + rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (rc) break; + + /* + * This is a temporary thing as bp is unregistered/registered + * to simulate modification + */ + bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len, + gen_type, bp->callback, + tsk, true); + thread->ptrace_bps[i] = NULL; + + if (!bp) { /* incorrect bp, or we have a bug in bp API */ + rc = -EINVAL; + break; + } + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + bp = NULL; + break; + } + thread->ptrace_bps[i] = bp; } /* * Make a second pass to free the remaining unused breakpoints @@ -553,15 +579,63 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) struct thread_struct *thread = &(tsk->thread); unsigned long val = 0; - if (n < HBP_NUM) - val = thread->debugreg[n]; - else if (n == 6) + if (n < HBP_NUM) { + struct perf_event *bp; + bp = thread->ptrace_bps[n]; + if (!bp) + return 0; + val = bp->hw.info.address; + } else if (n == 6) { val = thread->debugreg6; - else if (n == 7) - val = thread->debugreg7; + } else if (n == 7) { + val = ptrace_get_dr7(thread->ptrace_bps); + } return val; } +static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, + unsigned long addr) +{ + struct perf_event *bp; + struct thread_struct *t = &tsk->thread; + + if (!t->ptrace_bps[nr]) { + /* + * Put stub len and type to register (reserve) an inactive but + * correct bp + */ + bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1, + HW_BREAKPOINT_W, + ptrace_triggered, tsk, + false); + } else { + bp = t->ptrace_bps[nr]; + t->ptrace_bps[nr] = NULL; + bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len, + bp->attr.bp_type, + bp->callback, + tsk, + bp->attr.disabled); + } + + if (!bp) + return -EIO; + /* + * CHECKME: the previous code returned -EIO if the addr wasn't a + * valid task virtual addr. The new one will return -EINVAL in this + * case. + * -EINVAL may be what we want for in-kernel breakpoints users, but + * -EIO looks better for ptrace, since we refuse a register writing + * for the user. And anyway this is the previous behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; + + return 0; +} + /* * Handle PTRACE_POKEUSR calls for the debug register area. */ @@ -575,19 +649,13 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) return -EIO; if (n == 6) { - tsk->thread.debugreg6 = val; + thread->debugreg6 = val; goto ret_path; } if (n < HBP_NUM) { - if (thread->hbp[n]) { - if (arch_check_va_in_userspace(val, - thread->hbp[n]->info.len) == 0) { - rc = -EIO; - goto ret_path; - } - thread->hbp[n]->info.address = val; - } - thread->debugreg[n] = val; + rc = ptrace_set_breakpoint_addr(tsk, n, val); + if (rc) + return rc; } /* All that's left is DR7 */ if (n == 7) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 213a7a3e4562..565ebc65920e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include @@ -328,7 +327,6 @@ notrace static void __cpuinit start_secondary(void *unused) x86_cpuinit.setup_percpu_clockev(); wmb(); - load_debug_registers(); cpu_idle(); } @@ -1269,7 +1267,6 @@ void cpu_disable_common(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(); - hw_breakpoint_disable(); } int native_cpu_disable(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc2974adf9b6..22dee7aa7813 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -42,6 +42,7 @@ #define CREATE_TRACE_POINTS #include "trace.h" +#include #include #include #include @@ -3643,14 +3644,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu, kvm_run); - if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { - set_debugreg(current->thread.debugreg[0], 0); - set_debugreg(current->thread.debugreg[1], 1); - set_debugreg(current->thread.debugreg[2], 2); - set_debugreg(current->thread.debugreg[3], 3); - set_debugreg(current->thread.debugreg6, 6); - set_debugreg(current->thread.debugreg7, 7); - } + /* + * If the guest has used debug registers, at least dr7 + * will be disabled while returning to the host. + * If we don't have active breakpoints in the host, we don't + * care about the messed up debug address registers. But if + * we have some of them active, restore the old state. + */ + if (__get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK) + hw_breakpoint_restore(); set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index e09a44fc4664..0a979f3e5b8a 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -105,7 +105,6 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr4 = read_cr4(); ctxt->cr8 = read_cr8(); #endif - hw_breakpoint_disable(); } /* Needed by apm.c */ @@ -144,11 +143,6 @@ static void fix_processor_context(void) #endif load_TR_desc(); /* This does ltr */ load_LDT(¤t->active_mm->context); /* This does lldt */ - - /* - * Now maybe reload the debug registers - */ - load_debug_registers(); } /** diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 61ccc8f17eac..7eba9b92e5f3 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -1,136 +1,131 @@ #ifndef _LINUX_HW_BREAKPOINT_H #define _LINUX_HW_BREAKPOINT_H +#include -#ifdef __KERNEL__ -#include -#include -#include - -/** - * struct hw_breakpoint - unified kernel/user-space hardware breakpoint - * @triggered: callback invoked after target address access - * @info: arch-specific breakpoint info (address, length, and type) - * - * %hw_breakpoint structures are the kernel's way of representing - * hardware breakpoints. These are data breakpoints - * (also known as "watchpoints", triggered on data access), and the breakpoint's - * target address can be located in either kernel space or user space. - * - * The breakpoint's address, length, and type are highly - * architecture-specific. The values are encoded in the @info field; you - * specify them when registering the breakpoint. To examine the encoded - * values use hw_breakpoint_get_{kaddress,uaddress,len,type}(), declared - * below. - * - * The address is specified as a regular kernel pointer (for kernel-space - * breakponts) or as an %__user pointer (for user-space breakpoints). - * With register_user_hw_breakpoint(), the address must refer to a - * location in user space. The breakpoint will be active only while the - * requested task is running. Conversely with - * register_kernel_hw_breakpoint(), the address must refer to a location - * in kernel space, and the breakpoint will be active on all CPUs - * regardless of the current task. - * - * The length is the breakpoint's extent in bytes, which is subject to - * certain limitations. include/asm/hw_breakpoint.h contains macros - * defining the available lengths for a specific architecture. Note that - * the address's alignment must match the length. The breakpoint will - * catch accesses to any byte in the range from address to address + - * (length - 1). - * - * The breakpoint's type indicates the sort of access that will cause it - * to trigger. Possible values may include: - * - * %HW_BREAKPOINT_RW (triggered on read or write access), - * %HW_BREAKPOINT_WRITE (triggered on write access), and - * %HW_BREAKPOINT_READ (triggered on read access). - * - * Appropriate macros are defined in include/asm/hw_breakpoint.h; not all - * possibilities are available on all architectures. Execute breakpoints - * must have length equal to the special value %HW_BREAKPOINT_LEN_EXECUTE. - * - * When a breakpoint gets hit, the @triggered callback is - * invoked in_interrupt with a pointer to the %hw_breakpoint structure and the - * processor registers. - * Data breakpoints occur after the memory access has taken place. - * Breakpoints are disabled during execution @triggered, to avoid - * recursive traps and allow unhindered access to breakpointed memory. - * - * This sample code sets a breakpoint on pid_max and registers a callback - * function for writes to that variable. Note that it is not portable - * as written, because not all architectures support HW_BREAKPOINT_LEN_4. - * - * ---------------------------------------------------------------------- - * - * #include - * - * struct hw_breakpoint my_bp; - * - * static void my_triggered(struct hw_breakpoint *bp, struct pt_regs *regs) - * { - * printk(KERN_DEBUG "Inside triggered routine of breakpoint exception\n"); - * dump_stack(); - * ............... - * } - * - * static struct hw_breakpoint my_bp; - * - * static int init_module(void) - * { - * ...................... - * my_bp.info.type = HW_BREAKPOINT_WRITE; - * my_bp.info.len = HW_BREAKPOINT_LEN_4; - * - * my_bp.installed = (void *)my_bp_installed; - * - * rc = register_kernel_hw_breakpoint(&my_bp); - * ...................... - * } - * - * static void cleanup_module(void) - * { - * ...................... - * unregister_kernel_hw_breakpoint(&my_bp); - * ...................... - * } - * - * ---------------------------------------------------------------------- - */ -struct hw_breakpoint { - void (*triggered)(struct hw_breakpoint *, struct pt_regs *); - struct arch_hw_breakpoint info; +enum { + HW_BREAKPOINT_LEN_1 = 1, + HW_BREAKPOINT_LEN_2 = 2, + HW_BREAKPOINT_LEN_4 = 4, + HW_BREAKPOINT_LEN_8 = 8, }; -/* - * len and type values are defined in include/asm/hw_breakpoint.h. - * Available values vary according to the architecture. On i386 the - * possibilities are: - * - * HW_BREAKPOINT_LEN_1 - * HW_BREAKPOINT_LEN_2 - * HW_BREAKPOINT_LEN_4 - * HW_BREAKPOINT_RW - * HW_BREAKPOINT_READ - * - * On other architectures HW_BREAKPOINT_LEN_8 may be available, and the - * 1-, 2-, and 4-byte lengths may be unavailable. There also may be - * HW_BREAKPOINT_WRITE. You can use #ifdef to check at compile time. - */ +enum { + HW_BREAKPOINT_R = 1, + HW_BREAKPOINT_W = 2, + HW_BREAKPOINT_X = 4, +}; + +static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp) +{ + return &bp->hw.info; +} + +static inline unsigned long hw_breakpoint_addr(struct perf_event *bp) +{ + return bp->attr.bp_addr; +} + +static inline int hw_breakpoint_type(struct perf_event *bp) +{ + return bp->attr.bp_type; +} + +static inline int hw_breakpoint_len(struct perf_event *bp) +{ + return bp->attr.bp_len; +} + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +extern struct perf_event * +register_user_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active); + +/* FIXME: only change from the attr, and don't unregister */ +extern struct perf_event * +modify_user_hw_breakpoint(struct perf_event *bp, + unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active); -extern int register_user_hw_breakpoint(struct task_struct *tsk, - struct hw_breakpoint *bp); -extern int modify_user_hw_breakpoint(struct task_struct *tsk, - struct hw_breakpoint *bp); -extern void unregister_user_hw_breakpoint(struct task_struct *tsk, - struct hw_breakpoint *bp); /* * Kernel breakpoints are not associated with any particular thread. */ -extern int register_kernel_hw_breakpoint(struct hw_breakpoint *bp); -extern void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp); +extern struct perf_event * +register_wide_hw_breakpoint_cpu(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + int cpu, + bool active); + +extern struct perf_event ** +register_wide_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + bool active); + +extern int register_perf_hw_breakpoint(struct perf_event *bp); +extern int __register_perf_hw_breakpoint(struct perf_event *bp); +extern void unregister_hw_breakpoint(struct perf_event *bp); +extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events); + +extern int reserve_bp_slot(struct perf_event *bp); +extern void release_bp_slot(struct perf_event *bp); + +extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk); + +#else /* !CONFIG_HAVE_HW_BREAKPOINT */ + +static inline struct perf_event * +register_user_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active) { return NULL; } +static inline struct perf_event * +modify_user_hw_breakpoint(struct perf_event *bp, + unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active) { return NULL; } +static inline struct perf_event * +register_wide_hw_breakpoint_cpu(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + int cpu, + bool active) { return NULL; } +static inline struct perf_event ** +register_wide_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + bool active) { return NULL; } +static inline int +register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } +static inline int +__register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } +static inline void unregister_hw_breakpoint(struct perf_event *bp) { } +static inline void +unregister_wide_hw_breakpoint(struct perf_event **cpu_events) { } +static inline int +reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } +static inline void release_bp_slot(struct perf_event *bp) { } + +static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { } -extern unsigned int hbp_kernel_pos; +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -#endif /* __KERNEL__ */ -#endif /* _LINUX_HW_BREAKPOINT_H */ +#endif /* _LINUX_HW_BREAKPOINT_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8d54e6d25eeb..cead64ea6c15 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -18,6 +18,10 @@ #include #include +#ifdef CONFIG_HAVE_HW_BREAKPOINT +#include +#endif + /* * User-space ABI bits: */ @@ -31,6 +35,7 @@ enum perf_type_id { PERF_TYPE_TRACEPOINT = 2, PERF_TYPE_HW_CACHE = 3, PERF_TYPE_RAW = 4, + PERF_TYPE_BREAKPOINT = 5, PERF_TYPE_MAX, /* non-ABI */ }; @@ -207,6 +212,15 @@ struct perf_event_attr { __u32 wakeup_events; /* wakeup every n events */ __u32 wakeup_watermark; /* bytes before wakeup */ }; + + union { + struct { /* Hardware breakpoint info */ + __u64 bp_addr; + __u32 bp_type; + __u32 bp_len; + }; + }; + __u32 __reserved_2; __u64 __reserved_3; @@ -476,6 +490,11 @@ struct hw_perf_event { atomic64_t count; struct hrtimer hrtimer; }; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + union { /* breakpoint */ + struct arch_hw_breakpoint info; + }; +#endif }; atomic64_t prev_count; u64 sample_period; @@ -588,7 +607,7 @@ struct perf_event { u64 tstamp_running; u64 tstamp_stopped; - struct perf_event_attr attr; + struct perf_event_attr attr; struct hw_perf_event hw; struct perf_event_context *ctx; @@ -643,6 +662,8 @@ struct perf_event { perf_callback_t callback; + perf_callback_t event_callback; + #endif /* CONFIG_PERF_EVENTS */ }; @@ -831,6 +852,7 @@ extern int sysctl_perf_event_sample_rate; extern void perf_event_init(void); extern void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size); +extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags #define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \ @@ -865,6 +887,8 @@ static inline int perf_event_task_enable(void) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { } +static inline void +perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } diff --git a/kernel/exit.c b/kernel/exit.c index e61891f80123..266f8920628a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -979,6 +980,10 @@ NORET_TYPE void do_exit(long code) proc_exit_connector(tsk); + /* + * FIXME: do that only when needed, using sched_exit tracepoint + */ + flush_ptrace_hw_breakpoint(tsk); /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index c1f64e65a9f3..08f6d0163201 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -15,6 +15,7 @@ * * Copyright (C) 2007 Alan Stern * Copyright (C) IBM Corporation, 2009 + * Copyright (C) 2009, Frederic Weisbecker */ /* @@ -35,334 +36,242 @@ #include #include -#include +#include + #include #ifdef CONFIG_X86 #include #endif -/* - * Spinlock that protects all (un)register operations over kernel/user-space - * breakpoint requests - */ -static DEFINE_SPINLOCK(hw_breakpoint_lock); - -/* Array of kernel-space breakpoint structures */ -struct hw_breakpoint *hbp_kernel[HBP_NUM]; - -/* - * Per-processor copy of hbp_kernel[]. Used only when hbp_kernel is being - * modified but we need the older copy to handle any hbp exceptions. It will - * sync with hbp_kernel[] value after updation is done through IPIs. - */ -DEFINE_PER_CPU(struct hw_breakpoint*, this_hbp_kernel[HBP_NUM]); - -/* - * Kernel breakpoints grow downwards, starting from HBP_NUM - * 'hbp_kernel_pos' denotes lowest numbered breakpoint register occupied for - * kernel-space request. We will initialise it here and not in an __init - * routine because load_debug_registers(), which uses this variable can be - * called very early during CPU initialisation. - */ -unsigned int hbp_kernel_pos = HBP_NUM; -/* - * An array containing refcount of threads using a given bkpt register - * Accesses are synchronised by acquiring hw_breakpoint_lock - */ -unsigned int hbp_user_refcount[HBP_NUM]; +static atomic_t bp_slot; -/* - * Load the debug registers during startup of a CPU. - */ -void load_debug_registers(void) +int reserve_bp_slot(struct perf_event *bp) { - unsigned long flags; - struct task_struct *tsk = current; - - spin_lock_bh(&hw_breakpoint_lock); - - /* Prevent IPIs for new kernel breakpoint updates */ - local_irq_save(flags); - arch_update_kernel_hw_breakpoint(NULL); - local_irq_restore(flags); - - if (test_tsk_thread_flag(tsk, TIF_DEBUG)) - arch_install_thread_hw_breakpoint(tsk); - - spin_unlock_bh(&hw_breakpoint_lock); -} + if (atomic_inc_return(&bp_slot) == HBP_NUM) { + atomic_dec(&bp_slot); -/* - * Erase all the hardware breakpoint info associated with a thread. - * - * If tsk != current then tsk must not be usable (for example, a - * child being cleaned up from a failed fork). - */ -void flush_thread_hw_breakpoint(struct task_struct *tsk) -{ - int i; - struct thread_struct *thread = &(tsk->thread); - - spin_lock_bh(&hw_breakpoint_lock); - - /* The thread no longer has any breakpoints associated with it */ - clear_tsk_thread_flag(tsk, TIF_DEBUG); - for (i = 0; i < HBP_NUM; i++) { - if (thread->hbp[i]) { - hbp_user_refcount[i]--; - kfree(thread->hbp[i]); - thread->hbp[i] = NULL; - } + return -ENOSPC; } - arch_flush_thread_hw_breakpoint(tsk); - - /* Actually uninstall the breakpoints if necessary */ - if (tsk == current) - arch_uninstall_thread_hw_breakpoint(); - spin_unlock_bh(&hw_breakpoint_lock); + return 0; } -/* - * Copy the hardware breakpoint info from a thread to its cloned child. - */ -int copy_thread_hw_breakpoint(struct task_struct *tsk, - struct task_struct *child, unsigned long clone_flags) +void release_bp_slot(struct perf_event *bp) { - /* - * We will assume that breakpoint settings are not inherited - * and the child starts out with no debug registers set. - * But what about CLONE_PTRACE? - */ - clear_tsk_thread_flag(child, TIF_DEBUG); - - /* We will call flush routine since the debugregs are not inherited */ - arch_flush_thread_hw_breakpoint(child); - - return 0; + atomic_dec(&bp_slot); } -static int __register_user_hw_breakpoint(int pos, struct task_struct *tsk, - struct hw_breakpoint *bp) +int __register_perf_hw_breakpoint(struct perf_event *bp) { - struct thread_struct *thread = &(tsk->thread); - int rc; + int ret; - /* Do not overcommit. Fail if kernel has used the hbp registers */ - if (pos >= hbp_kernel_pos) - return -ENOSPC; + ret = reserve_bp_slot(bp); + if (ret) + return ret; - rc = arch_validate_hwbkpt_settings(bp, tsk); - if (rc) - return rc; + if (!bp->attr.disabled) + ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); - thread->hbp[pos] = bp; - hbp_user_refcount[pos]++; + return ret; +} - arch_update_user_hw_breakpoint(pos, tsk); - /* - * Does it need to be installed right now? - * Otherwise it will get installed the next time tsk runs - */ - if (tsk == current) - arch_install_thread_hw_breakpoint(tsk); +int register_perf_hw_breakpoint(struct perf_event *bp) +{ + bp->callback = perf_bp_event; - return rc; + return __register_perf_hw_breakpoint(bp); } /* - * Modify the address of a hbp register already in use by the task - * Do not invoke this in-lieu of a __unregister_user_hw_breakpoint() + * Register a breakpoint bound to a task and a given cpu. + * If cpu is -1, the breakpoint is active for the task in every cpu + * If the task is -1, the breakpoint is active for every tasks in the given + * cpu. */ -static int __modify_user_hw_breakpoint(int pos, struct task_struct *tsk, - struct hw_breakpoint *bp) +static struct perf_event * +register_user_hw_breakpoint_cpu(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + pid_t pid, + int cpu, + bool active) { - struct thread_struct *thread = &(tsk->thread); - - if ((pos >= hbp_kernel_pos) || (arch_validate_hwbkpt_settings(bp, tsk))) - return -EINVAL; - - if (thread->hbp[pos] == NULL) - return -EINVAL; - - thread->hbp[pos] = bp; + struct perf_event_attr *attr; + struct perf_event *bp; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return ERR_PTR(-ENOMEM); + + attr->type = PERF_TYPE_BREAKPOINT; + attr->size = sizeof(*attr); + attr->bp_addr = addr; + attr->bp_len = len; + attr->bp_type = type; /* - * 'pos' must be that of a hbp register already used by 'tsk' - * Otherwise arch_modify_user_hw_breakpoint() will fail + * Such breakpoints are used by debuggers to trigger signals when + * we hit the excepted memory op. We can't miss such events, they + * must be pinned. */ - arch_update_user_hw_breakpoint(pos, tsk); + attr->pinned = 1; - if (tsk == current) - arch_install_thread_hw_breakpoint(tsk); + if (!active) + attr->disabled = 1; - return 0; -} - -static void __unregister_user_hw_breakpoint(int pos, struct task_struct *tsk) -{ - hbp_user_refcount[pos]--; - tsk->thread.hbp[pos] = NULL; + bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered); + kfree(attr); - arch_update_user_hw_breakpoint(pos, tsk); - - if (tsk == current) - arch_install_thread_hw_breakpoint(tsk); + return bp; } /** * register_user_hw_breakpoint - register a hardware breakpoint for user space + * @addr: is the memory address that triggers the breakpoint + * @len: the length of the access to the memory (1 byte, 2 bytes etc...) + * @type: the type of the access to the memory (read/write/exec) + * @triggered: callback to trigger when we hit the breakpoint * @tsk: pointer to 'task_struct' of the process to which the address belongs - * @bp: the breakpoint structure to register - * - * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and - * @bp->triggered must be set properly before invocation + * @active: should we activate it while registering it * */ -int register_user_hw_breakpoint(struct task_struct *tsk, - struct hw_breakpoint *bp) +struct perf_event * +register_user_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active) { - struct thread_struct *thread = &(tsk->thread); - int i, rc = -ENOSPC; - - spin_lock_bh(&hw_breakpoint_lock); - - for (i = 0; i < hbp_kernel_pos; i++) { - if (!thread->hbp[i]) { - rc = __register_user_hw_breakpoint(i, tsk, bp); - break; - } - } - if (!rc) - set_tsk_thread_flag(tsk, TIF_DEBUG); - - spin_unlock_bh(&hw_breakpoint_lock); - return rc; + return register_user_hw_breakpoint_cpu(addr, len, type, triggered, + tsk->pid, -1, active); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); /** * modify_user_hw_breakpoint - modify a user-space hardware breakpoint + * @bp: the breakpoint structure to modify + * @addr: is the memory address that triggers the breakpoint + * @len: the length of the access to the memory (1 byte, 2 bytes etc...) + * @type: the type of the access to the memory (read/write/exec) + * @triggered: callback to trigger when we hit the breakpoint * @tsk: pointer to 'task_struct' of the process to which the address belongs - * @bp: the breakpoint structure to unregister - * + * @active: should we activate it while registering it */ -int modify_user_hw_breakpoint(struct task_struct *tsk, struct hw_breakpoint *bp) +struct perf_event * +modify_user_hw_breakpoint(struct perf_event *bp, + unsigned long addr, + int len, + int type, + perf_callback_t triggered, + struct task_struct *tsk, + bool active) { - struct thread_struct *thread = &(tsk->thread); - int i, ret = -ENOENT; + /* + * FIXME: do it without unregistering + * - We don't want to lose our slot + * - If the new bp is incorrect, don't lose the older one + */ + unregister_hw_breakpoint(bp); - spin_lock_bh(&hw_breakpoint_lock); - for (i = 0; i < hbp_kernel_pos; i++) { - if (bp == thread->hbp[i]) { - ret = __modify_user_hw_breakpoint(i, tsk, bp); - break; - } - } - spin_unlock_bh(&hw_breakpoint_lock); - return ret; + return register_user_hw_breakpoint(addr, len, type, triggered, + tsk, active); } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); /** - * unregister_user_hw_breakpoint - unregister a user-space hardware breakpoint - * @tsk: pointer to 'task_struct' of the process to which the address belongs + * unregister_hw_breakpoint - unregister a user-space hardware breakpoint * @bp: the breakpoint structure to unregister - * */ -void unregister_user_hw_breakpoint(struct task_struct *tsk, - struct hw_breakpoint *bp) +void unregister_hw_breakpoint(struct perf_event *bp) { - struct thread_struct *thread = &(tsk->thread); - int i, pos = -1, hbp_counter = 0; - - spin_lock_bh(&hw_breakpoint_lock); - for (i = 0; i < hbp_kernel_pos; i++) { - if (thread->hbp[i]) - hbp_counter++; - if (bp == thread->hbp[i]) - pos = i; - } - if (pos >= 0) { - __unregister_user_hw_breakpoint(pos, tsk); - hbp_counter--; - } - if (!hbp_counter) - clear_tsk_thread_flag(tsk, TIF_DEBUG); - - spin_unlock_bh(&hw_breakpoint_lock); + if (!bp) + return; + perf_event_release_kernel(bp); +} +EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); + +static struct perf_event * +register_kernel_hw_breakpoint_cpu(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + int cpu, + bool active) +{ + return register_user_hw_breakpoint_cpu(addr, len, type, triggered, + -1, cpu, active); } -EXPORT_SYMBOL_GPL(unregister_user_hw_breakpoint); /** - * register_kernel_hw_breakpoint - register a hardware breakpoint for kernel space - * @bp: the breakpoint structure to register - * - * @bp.info->name or @bp.info->address, @bp.info->len, @bp.info->type and - * @bp->triggered must be set properly before invocation + * register_wide_hw_breakpoint - register a wide breakpoint in the kernel + * @addr: is the memory address that triggers the breakpoint + * @len: the length of the access to the memory (1 byte, 2 bytes etc...) + * @type: the type of the access to the memory (read/write/exec) + * @triggered: callback to trigger when we hit the breakpoint + * @active: should we activate it while registering it * + * @return a set of per_cpu pointers to perf events */ -int register_kernel_hw_breakpoint(struct hw_breakpoint *bp) +struct perf_event ** +register_wide_hw_breakpoint(unsigned long addr, + int len, + int type, + perf_callback_t triggered, + bool active) { - int rc; + struct perf_event **cpu_events, **pevent, *bp; + long err; + int cpu; + + cpu_events = alloc_percpu(typeof(*cpu_events)); + if (!cpu_events) + return ERR_PTR(-ENOMEM); - rc = arch_validate_hwbkpt_settings(bp, NULL); - if (rc) - return rc; + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + bp = register_kernel_hw_breakpoint_cpu(addr, len, type, + triggered, cpu, active); - spin_lock_bh(&hw_breakpoint_lock); + *pevent = bp; - rc = -ENOSPC; - /* Check if we are over-committing */ - if ((hbp_kernel_pos > 0) && (!hbp_user_refcount[hbp_kernel_pos-1])) { - hbp_kernel_pos--; - hbp_kernel[hbp_kernel_pos] = bp; - on_each_cpu(arch_update_kernel_hw_breakpoint, NULL, 1); - rc = 0; + if (IS_ERR(bp) || !bp) { + err = PTR_ERR(bp); + goto fail; + } } - spin_unlock_bh(&hw_breakpoint_lock); - return rc; + return cpu_events; + +fail: + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + if (IS_ERR(*pevent) || !*pevent) + break; + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); + /* return the error if any */ + return ERR_PTR(err); } -EXPORT_SYMBOL_GPL(register_kernel_hw_breakpoint); /** - * unregister_kernel_hw_breakpoint - unregister a HW breakpoint for kernel space - * @bp: the breakpoint structure to unregister - * - * Uninstalls and unregisters @bp. + * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel + * @cpu_events: the per cpu set of events to unregister */ -void unregister_kernel_hw_breakpoint(struct hw_breakpoint *bp) +void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) { - int i, j; - - spin_lock_bh(&hw_breakpoint_lock); - - /* Find the 'bp' in our list of breakpoints for kernel */ - for (i = hbp_kernel_pos; i < HBP_NUM; i++) - if (bp == hbp_kernel[i]) - break; + int cpu; + struct perf_event **pevent; - /* Check if we did not find a match for 'bp'. If so return early */ - if (i == HBP_NUM) { - spin_unlock_bh(&hw_breakpoint_lock); - return; + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + unregister_hw_breakpoint(*pevent); } - - /* - * We'll shift the breakpoints one-level above to compact if - * unregistration creates a hole - */ - for (j = i; j > hbp_kernel_pos; j--) - hbp_kernel[j] = hbp_kernel[j-1]; - - hbp_kernel[hbp_kernel_pos] = NULL; - on_each_cpu(arch_update_kernel_hw_breakpoint, NULL, 1); - hbp_kernel_pos++; - - spin_unlock_bh(&hw_breakpoint_lock); + free_percpu(cpu_events); } -EXPORT_SYMBOL_GPL(unregister_kernel_hw_breakpoint); + static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, @@ -374,5 +283,12 @@ static int __init init_hw_breakpoint(void) { return register_die_notifier(&hw_breakpoint_exceptions_nb); } - core_initcall(init_hw_breakpoint); + + +struct pmu perf_ops_bp = { + .enable = arch_install_hw_breakpoint, + .disable = arch_uninstall_hw_breakpoint, + .read = hw_breakpoint_pmu_read, + .unthrottle = hw_breakpoint_pmu_unthrottle +}; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 5087125e2a00..98dc56b2ebe4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -4229,6 +4230,51 @@ static void perf_event_free_filter(struct perf_event *event) #endif /* CONFIG_EVENT_PROFILE */ +#ifdef CONFIG_HAVE_HW_BREAKPOINT +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + int err; + /* + * The breakpoint is already filled if we haven't created the counter + * through perf syscall + * FIXME: manage to get trigerred to NULL if it comes from syscalls + */ + if (!bp->callback) + err = register_perf_hw_breakpoint(bp); + else + err = __register_perf_hw_breakpoint(bp); + if (err) + return ERR_PTR(err); + + bp->destroy = bp_perf_event_destroy; + + return &perf_ops_bp; +} + +void perf_bp_event(struct perf_event *bp, void *regs) +{ + /* TODO */ +} +#else +static void bp_perf_event_destroy(struct perf_event *event) +{ +} + +static const struct pmu *bp_perf_event_init(struct perf_event *bp) +{ + return NULL; +} + +void perf_bp_event(struct perf_event *bp, void *regs) +{ +} +#endif + atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) @@ -4375,6 +4421,11 @@ perf_event_alloc(struct perf_event_attr *attr, pmu = tp_perf_event_init(event); break; + case PERF_TYPE_BREAKPOINT: + pmu = bp_perf_event_init(event); + break; + + default: break; } @@ -4686,7 +4737,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) - return NULL ; + return NULL; event = perf_event_alloc(attr, cpu, ctx, NULL, NULL, callback, GFP_KERNEL); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 91c3d0e9a5a1..d72f06ff263f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,14 +11,11 @@ #include #include #include +#include #include #include -#ifdef CONFIG_KSYM_TRACER -#include -#endif - enum trace_type { __TRACE_FIRST_TYPE = 0, diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e19747d4f860..c16a08f399df 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -372,11 +372,11 @@ FTRACE_ENTRY(ksym_trace, ksym_trace_entry, F_STRUCT( __field( unsigned long, ip ) __field( unsigned char, type ) - __array( char , ksym_name, KSYM_NAME_LEN ) __array( char , cmd, TASK_COMM_LEN ) + __field( unsigned long, addr ) ), - F_printk("ip: %pF type: %d ksym_name: %s cmd: %s", + F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", (void *)__entry->ip, (unsigned int)__entry->type, - __entry->ksym_name, __entry->cmd) + (void *)__entry->addr, __entry->cmd) ); diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 6d5609c67378..fea83eeeef09 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -29,7 +29,11 @@ #include "trace_stat.h" #include "trace.h" -/* For now, let us restrict the no. of symbols traced simultaneously to number +#include +#include + +/* + * For now, let us restrict the no. of symbols traced simultaneously to number * of available hardware breakpoint registers. */ #define KSYM_TRACER_MAX HBP_NUM @@ -37,8 +41,10 @@ #define KSYM_TRACER_OP_LEN 3 /* rw- */ struct trace_ksym { - struct hw_breakpoint *ksym_hbp; + struct perf_event **ksym_hbp; unsigned long ksym_addr; + int type; + int len; #ifdef CONFIG_PROFILE_KSYM_TRACER unsigned long counter; #endif @@ -75,10 +81,11 @@ void ksym_collect_stats(unsigned long hbp_hit_addr) } #endif /* CONFIG_PROFILE_KSYM_TRACER */ -void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs) +void ksym_hbp_handler(struct perf_event *hbp, void *data) { struct ring_buffer_event *event; struct ksym_trace_entry *entry; + struct pt_regs *regs = data; struct ring_buffer *buffer; int pc; @@ -96,12 +103,12 @@ void ksym_hbp_handler(struct hw_breakpoint *hbp, struct pt_regs *regs) entry = ring_buffer_event_data(event); entry->ip = instruction_pointer(regs); - entry->type = hbp->info.type; - strlcpy(entry->ksym_name, hbp->info.name, KSYM_SYMBOL_LEN); + entry->type = hw_breakpoint_type(hbp); + entry->addr = hw_breakpoint_addr(hbp); strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); #ifdef CONFIG_PROFILE_KSYM_TRACER - ksym_collect_stats(hbp->info.address); + ksym_collect_stats(hw_breakpoint_addr(hbp)); #endif /* CONFIG_PROFILE_KSYM_TRACER */ trace_buffer_unlock_commit(buffer, event, 0, pc); @@ -120,31 +127,21 @@ static int ksym_trace_get_access_type(char *str) int access = 0; if (str[0] == 'r') - access += 4; - else if (str[0] != '-') - return -EINVAL; + access |= HW_BREAKPOINT_R; if (str[1] == 'w') - access += 2; - else if (str[1] != '-') - return -EINVAL; + access |= HW_BREAKPOINT_W; - if (str[2] != '-') - return -EINVAL; + if (str[2] == 'x') + access |= HW_BREAKPOINT_X; switch (access) { - case 6: - access = HW_BREAKPOINT_RW; - break; - case 4: - access = -EINVAL; - break; - case 2: - access = HW_BREAKPOINT_WRITE; - break; + case HW_BREAKPOINT_W: + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + return access; + default: + return -EINVAL; } - - return access; } /* @@ -194,36 +191,33 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) if (!entry) return -ENOMEM; - entry->ksym_hbp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL); - if (!entry->ksym_hbp) - goto err; - - entry->ksym_hbp->info.name = kstrdup(ksymname, GFP_KERNEL); - if (!entry->ksym_hbp->info.name) - goto err; - - entry->ksym_hbp->info.type = op; - entry->ksym_addr = entry->ksym_hbp->info.address = addr; -#ifdef CONFIG_X86 - entry->ksym_hbp->info.len = HW_BREAKPOINT_LEN_4; -#endif - entry->ksym_hbp->triggered = (void *)ksym_hbp_handler; + entry->type = op; + entry->ksym_addr = addr; + entry->len = HW_BREAKPOINT_LEN_4; + + ret = -EAGAIN; + entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr, + entry->len, entry->type, + ksym_hbp_handler, true); + if (IS_ERR(entry->ksym_hbp)) { + entry->ksym_hbp = NULL; + ret = PTR_ERR(entry->ksym_hbp); + } - ret = register_kernel_hw_breakpoint(entry->ksym_hbp); - if (ret < 0) { + if (!entry->ksym_hbp) { printk(KERN_INFO "ksym_tracer request failed. Try again" " later!!\n"); - ret = -EAGAIN; goto err; } + hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); ksym_filter_entry_count++; + return 0; + err: - if (entry->ksym_hbp) - kfree(entry->ksym_hbp->info.name); - kfree(entry->ksym_hbp); kfree(entry); + return ret; } @@ -244,10 +238,10 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, mutex_lock(&ksym_tracer_mutex); hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - ret = trace_seq_printf(s, "%s:", entry->ksym_hbp->info.name); - if (entry->ksym_hbp->info.type == HW_BREAKPOINT_WRITE) + ret = trace_seq_printf(s, "%pS:", (void *)entry->ksym_addr); + if (entry->type == HW_BREAKPOINT_W) ret = trace_seq_puts(s, "-w-\n"); - else if (entry->ksym_hbp->info.type == HW_BREAKPOINT_RW) + else if (entry->type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) ret = trace_seq_puts(s, "rw-\n"); WARN_ON_ONCE(!ret); } @@ -269,12 +263,10 @@ static void __ksym_trace_reset(void) mutex_lock(&ksym_tracer_mutex); hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, ksym_hlist) { - unregister_kernel_hw_breakpoint(entry->ksym_hbp); + unregister_wide_hw_breakpoint(entry->ksym_hbp); ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); - kfree(entry->ksym_hbp->info.name); - kfree(entry->ksym_hbp); kfree(entry); } mutex_unlock(&ksym_tracer_mutex); @@ -327,7 +319,7 @@ static ssize_t ksym_trace_filter_write(struct file *file, hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { if (entry->ksym_addr == ksym_addr) { /* Check for malformed request: (6) */ - if (entry->ksym_hbp->info.type != op) + if (entry->type != op) changed = 1; else goto out; @@ -335,18 +327,21 @@ static ssize_t ksym_trace_filter_write(struct file *file, } } if (changed) { - unregister_kernel_hw_breakpoint(entry->ksym_hbp); - entry->ksym_hbp->info.type = op; + unregister_wide_hw_breakpoint(entry->ksym_hbp); + entry->type = op; if (op > 0) { - ret = register_kernel_hw_breakpoint(entry->ksym_hbp); - if (ret == 0) + entry->ksym_hbp = + register_wide_hw_breakpoint(entry->ksym_addr, + entry->len, entry->type, + ksym_hbp_handler, true); + if (IS_ERR(entry->ksym_hbp)) + entry->ksym_hbp = NULL; + if (!entry->ksym_hbp) goto out; } ksym_filter_entry_count--; hlist_del_rcu(&(entry->ksym_hlist)); synchronize_rcu(); - kfree(entry->ksym_hbp->info.name); - kfree(entry->ksym_hbp); kfree(entry); ret = 0; goto out; @@ -413,16 +408,16 @@ static enum print_line_t ksym_trace_output(struct trace_iterator *iter) trace_assign_type(field, entry); - ret = trace_seq_printf(s, "%11s-%-5d [%03d] %-30s ", field->cmd, - entry->pid, iter->cpu, field->ksym_name); + ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, + entry->pid, iter->cpu, (char *)field->addr); if (!ret) return TRACE_TYPE_PARTIAL_LINE; switch (field->type) { - case HW_BREAKPOINT_WRITE: + case HW_BREAKPOINT_W: ret = trace_seq_printf(s, " W "); break; - case HW_BREAKPOINT_RW: + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: ret = trace_seq_printf(s, " RW "); break; default: @@ -490,14 +485,13 @@ static int ksym_tracer_stat_show(struct seq_file *m, void *v) entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); - if (entry->ksym_hbp) - access_type = entry->ksym_hbp->info.type; + access_type = entry->type; switch (access_type) { - case HW_BREAKPOINT_WRITE: + case HW_BREAKPOINT_W: seq_puts(m, " W "); break; - case HW_BREAKPOINT_RW: + case HW_BREAKPOINT_R | HW_BREAKPOINT_W: seq_puts(m, " RW "); break; default: diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 7179c12e4f0f..27c5072c2e6b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -828,7 +828,8 @@ trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) ksym_selftest_dummy = 0; /* Register the read-write tracing request */ - ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, HW_BREAKPOINT_RW, + ret = process_new_ksym_entry(KSYM_SELFTEST_ENTRY, + HW_BREAKPOINT_R | HW_BREAKPOINT_W, (unsigned long)(&ksym_selftest_dummy)); if (ret < 0) { -- cgit v1.2.3-59-g8ed1b From a2202aa29289db64ca7988b12343158b67b27f10 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Tue, 10 Nov 2009 09:38:24 +0800 Subject: x86: Under BIOS control, restore AP's APIC_LVTTHMR to the BSP value On platforms where the BIOS handles the thermal monitor interrupt, APIC_LVTTHMR on each logical CPU is programmed to generate a SMI and OS must not touch it. Unfortunately AP bringup sequence using INIT-SIPI-SIPI clears all the LVT entries except the mask bit. Essentially this results in all LVT entries including the thermal monitoring interrupt set to masked (clearing the bios programmed value for APIC_LVTTHMR). And this leads to kernel take over the thermal monitoring interrupt on AP's but not on BSP (leaving the bios programmed value only on BSP). As a result of this, we have seen system hangs when the thermal monitoring interrupt is generated. Fix this by reading the initial value of thermal LVT entry on BSP and if bios has taken over the control, then program the same value on all AP's and leave the thermal monitoring interrupt control on all the logical cpu's to the bios. Signed-off-by: Yong Wang Reviewed-by: Suresh Siddha Cc: Borislav Petkov Cc: Arjan van de Ven LKML-Reference: <20091110013824.GA24940@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar Cc: stable@kernel.org --- arch/x86/include/asm/mce.h | 9 +++++++++ arch/x86/kernel/cpu/common.c | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 29 ++++++++++++++++++++++++++++- arch/x86/kernel/setup.c | 3 +++ 5 files changed, 43 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 161485da6838..858baa061cfc 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -120,8 +120,10 @@ extern int mce_disabled; extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE +int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); #else +static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} #endif @@ -215,5 +217,12 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); void mce_log_therm_throt_event(__u64 status); + +#ifdef CONFIG_X86_THERMAL_VECTOR +extern void mcheck_intel_therm_init(void); +#else +static inline void mcheck_intel_therm_init(void) { } +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4df69a38be57..9053be5d95cd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -837,10 +837,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } -#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_cpu_init(c); -#endif select_idle_routine(c); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 80801705edd7..0d4102031a4c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1655,13 +1655,14 @@ static int __init mcheck_enable(char *str) } __setup("mce", mcheck_enable); -static int __init mcheck_init(void) +int __init mcheck_init(void) { atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); + mcheck_intel_therm_init(); + return 0; } -early_initcall(mcheck_init); /* * Sysfs support diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index b3a1dba75330..7f3cf36ed124 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); +static u32 lvtthmr_init __read_mostly; + #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) @@ -254,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } +void mcheck_intel_therm_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) && + cpu_has(&boot_cpu_data, X86_FEATURE_ACC)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -270,7 +284,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); + + /* + * The initial value of thermal LVT entries on all APs always reads + * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI + * sequence to them and LVT registers are reset to 0s except for + * the mask bits which are set to 1s when APs receive INIT IPI. + * Always restore the value that BIOS has programmed on AP based on + * BSP's info we saved since BIOS is always setting the same value + * for all threads/cores + */ + apic_write(APIC_LVTTHMR, lvtthmr_init); + + h = lvtthmr_init; + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e09f0e2c14b5..179c1f2aa457 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -109,6 +109,7 @@ #ifdef CONFIG_X86_64 #include #endif +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -1024,6 +1025,8 @@ void __init setup_arch(char **cmdline_p) #endif #endif x86_init.oem.banner(); + + mcheck_init(); } #ifdef CONFIG_X86_32 -- cgit v1.2.3-59-g8ed1b From 9f6b3c2c30cfbb1166ce7e74a8f9fd93ae19d2de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 9 Nov 2009 21:03:43 +0100 Subject: hw-breakpoints: Fix broken a.out format dump Fix the broken a.out format dump. For now we only dump the ptrace breakpoints. TODO: Dump every perf breakpoints for the current thread, not only ptrace based ones. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: "K. Prasad" --- arch/x86/include/asm/a.out-core.h | 10 ++-------- arch/x86/include/asm/debugreg.h | 2 ++ arch/x86/kernel/hw_breakpoint.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h index fc4685dd6e4d..7a15588e45d4 100644 --- a/arch/x86/include/asm/a.out-core.h +++ b/arch/x86/include/asm/a.out-core.h @@ -17,6 +17,7 @@ #include #include +#include /* * fill in the user structure for an a.out core dump @@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - dump->u_debugreg[0] = current->thread.debugreg[0]; - dump->u_debugreg[1] = current->thread.debugreg[1]; - dump->u_debugreg[2] = current->thread.debugreg[2]; - dump->u_debugreg[3] = current->thread.debugreg[3]; - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - dump->u_debugreg[7] = current->thread.debugreg7; + aout_dump_debugregs(dump); if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 9a3333c91f9a..f1b673f08239 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -89,6 +89,8 @@ static inline void hw_breakpoint_disable(void) set_debugreg(0UL, 3); } +extern void aout_dump_debugregs(struct user *dump); + #ifdef CONFIG_KVM extern void hw_breakpoint_restore(void); #endif diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index e622620790bd..57dcee5fa958 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -375,6 +375,41 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return 0; } +/* + * Dump the debug register contents to the user. + * We can't dump our per cpu values because it + * may contain cpu wide breakpoint, something that + * doesn't belong to the current task. + * + * TODO: include non-ptrace user breakpoints (perf) + */ +void aout_dump_debugregs(struct user *dump) +{ + int i; + int dr7 = 0; + struct perf_event *bp; + struct arch_hw_breakpoint *info; + struct thread_struct *thread = ¤t->thread; + + for (i = 0; i < HBP_NUM; i++) { + bp = thread->ptrace_bps[i]; + + if (bp && !bp->attr.disabled) { + dump->u_debugreg[i] = bp->attr.bp_addr; + info = counter_arch_bp(bp); + dr7 |= encode_dr7(i, info->len, info->type); + } else { + dump->u_debugreg[i] = 0; + } + } + + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + + dump->u_debugreg[7] = dr7; +} + /* * Release the user breakpoints used by ptrace */ -- cgit v1.2.3-59-g8ed1b From ce6b5d768c79b9d5dd6345c033bae781d5ca9b8e Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 11 Nov 2009 15:51:25 +0800 Subject: x86: Mark the thermal init functions __init Mark the thermal init functions __init so that the init memory can be freed. Signed-off-by: Yong Wang LKML-Reference: <20091111075125.GA17900@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7f3cf36ed124..8a73d5c12a05 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -256,7 +256,7 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } -void mcheck_intel_therm_init(void) +void __init mcheck_intel_therm_init(void) { /* * This function is only called on boot CPU. Save the init thermal @@ -268,7 +268,7 @@ void mcheck_intel_therm_init(void) lvtthmr_init = apic_read(APIC_LVTTHMR); } -void intel_init_thermal(struct cpuinfo_x86 *c) +void __init intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); int tm2 = 0; -- cgit v1.2.3-59-g8ed1b From cffd377e5879ea58522224a785a083f201afd80e Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 12 Nov 2009 15:52:40 +0900 Subject: x86, mce: Fix __init annotations The intel_init_thermal() is called from resume path, so it cannot be marked as __init. OTOH mce_banks_init() is only called from __mcheck_cpu_cap_init() which is marked as __cpuinit, so it can be also marked as __cpuinit. Signed-off-by: Hidetoshi Seto Acked-by: Yong Wang LKML-Reference: <4AFBB0B8.2070501@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0d4102031a4c..5f277cad2ed7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1201,7 +1201,7 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int mce_banks_init(void) +static int __cpuinit __mcheck_cpu_mce_banks_init(void) { int i; @@ -1242,7 +1242,7 @@ static int __cpuinit __mcheck_cpu_cap_init(void) WARN_ON(banks != 0 && b != banks); banks = b; if (!mce_banks) { - int err = mce_banks_init(); + int err = __mcheck_cpu_mce_banks_init(); if (err) return err; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 8a73d5c12a05..4fef985fc221 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -268,7 +268,7 @@ void __init mcheck_intel_therm_init(void) lvtthmr_init = apic_read(APIC_LVTTHMR); } -void __init intel_init_thermal(struct cpuinfo_x86 *c) +void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); int tm2 = 0; -- cgit v1.2.3-59-g8ed1b From db48cccc7c709ccfa7cb4ac702bc27c216bffee7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 12 Nov 2009 11:25:34 +0900 Subject: perf_event, x86: Annotate init functions and data Annotate init functions and data with __init and __initconst. Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <4AFB721E.8070203@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2e20bca3cca1..bd8743024204 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -245,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static const u64 nehalem_hw_cache_event_ids +static __initconst u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -336,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids }, }; -static const u64 core2_hw_cache_event_ids +static __initconst u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -427,7 +427,7 @@ static const u64 core2_hw_cache_event_ids }, }; -static const u64 atom_hw_cache_event_ids +static __initconst u64 atom_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -536,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event) return hw_event & CORE_EVNTSEL_MASK; } -static const u64 amd_hw_cache_event_ids +static __initconst u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -1964,7 +1964,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = { .priority = 1 }; -static struct x86_pmu p6_pmu = { +static __initconst struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = p6_pmu_handle_irq, .disable_all = p6_pmu_disable_all, @@ -1992,7 +1992,7 @@ static struct x86_pmu p6_pmu = { .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu intel_pmu = { +static __initconst struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, .disable_all = intel_pmu_disable_all, @@ -2016,7 +2016,7 @@ static struct x86_pmu intel_pmu = { .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu amd_pmu = { +static __initconst struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, @@ -2037,7 +2037,7 @@ static struct x86_pmu amd_pmu = { .get_event_idx = gen_get_event_idx, }; -static int p6_pmu_init(void) +static __init int p6_pmu_init(void) { switch (boot_cpu_data.x86_model) { case 1: @@ -2071,7 +2071,7 @@ static int p6_pmu_init(void) return 0; } -static int intel_pmu_init(void) +static __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -2144,7 +2144,7 @@ static int intel_pmu_init(void) return 0; } -static int amd_pmu_init(void) +static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) -- cgit v1.2.3-59-g8ed1b From 68efa37df779b3e04280598e8b5b3a1919b65fee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 14 Nov 2009 01:35:29 +0100 Subject: hw-breakpoints, x86: Fix modular KVM build This build error: arch/x86/kvm/x86.c:3655: error: implicit declaration of function 'hw_breakpoint_restore' Happens because in the CONFIG_KVM=m case there's no 'CONFIG_KVM' define in the kernel - it's CONFIG_KVM_MODULE in that case. Make the prototype available unconditionally. Cc: Frederic Weisbecker Cc: Prasad LKML-Reference: <1258114575-32655-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/debugreg.h | 2 -- arch/x86/kernel/hw_breakpoint.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 0f6e92af4227..fdabd8435765 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -96,9 +96,7 @@ static inline int hw_breakpoint_active(void) extern void aout_dump_debugregs(struct user *dump); -#ifdef CONFIG_KVM extern void hw_breakpoint_restore(void); -#endif #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 57dcee5fa958..752daebe91c6 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -43,6 +43,7 @@ /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, dr7); +EXPORT_PER_CPU_SYMBOL(dr7); /* Per cpu debug address registers values */ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); @@ -409,6 +410,7 @@ void aout_dump_debugregs(struct user *dump) dump->u_debugreg[7] = dr7; } +EXPORT_SYMBOL_GPL(aout_dump_debugregs); /* * Release the user breakpoints used by ptrace @@ -424,7 +426,6 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) } } -#ifdef CONFIG_KVM void hw_breakpoint_restore(void) { set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); @@ -435,7 +436,6 @@ void hw_breakpoint_restore(void) set_debugreg(__get_cpu_var(dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); -#endif /* * Handle debug exception notifications. -- cgit v1.2.3-59-g8ed1b From 6e3d8330ae2c4b2c11a9577a0130d2ecda1c610d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Nov 2009 10:19:20 +0100 Subject: perf events: Do not generate function trace entries in perf code Decreases perf overhead when function tracing is enabled, by about 50%. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 1 + kernel/Makefile | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 68537e957a9b..1d2cb383410e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -5,6 +5,7 @@ # Don't trace early stages of a secondary CPU boot ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif # Make sure load_percpu_segment has no stackprotector diff --git a/kernel/Makefile b/kernel/Makefile index 17b575ec7d07..6b7ce8173dfd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o -- cgit v1.2.3-59-g8ed1b From ba6909b719a5ccc0c8100d2895bb7ff557b2eeae Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 23 Nov 2009 21:17:13 +0530 Subject: hw-breakpoint: Attribute authorship of hw-breakpoint related files Attribute authorship to developers of hw-breakpoint related files. Signed-off-by: K.Prasad Cc: Alan Stern Cc: Frederic Weisbecker LKML-Reference: <20091123154713.GA5593@in.ibm.com> [ v2: moved it to latest -tip ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 4 ++++ kernel/hw_breakpoint.c | 4 ++++ samples/hw_breakpoint/data_breakpoint.c | 2 ++ 3 files changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 752daebe91c6..4d267fb77828 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -16,6 +16,10 @@ * Copyright (C) 2007 Alan Stern * Copyright (C) 2009 IBM Corporation * Copyright (C) 2009 Frederic Weisbecker + * + * Authors: Alan Stern + * K.Prasad + * Frederic Weisbecker */ /* diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index b6d6fa273eeb..c16662268d75 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -18,6 +18,10 @@ * Copyright (C) 2009, Frederic Weisbecker * * Thanks to Ingo Molnar for his many suggestions. + * + * Authors: Alan Stern + * K.Prasad + * Frederic Weisbecker */ /* diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index 5bc9819a819e..95063818bcf4 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -23,6 +23,8 @@ * that variable. * * Copyright (C) IBM Corporation, 2009 + * + * Author: K.Prasad */ #include /* Needed by all modules */ #include /* Needed for KERN_INFO */ -- cgit v1.2.3-59-g8ed1b From 0444c9bd0cf4e0eb946a7fcaf34765accfa9404a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 20 Nov 2009 14:03:05 +0000 Subject: x86: Tighten conditionals on MCE related statistics irq_thermal_count is only being maintained when X86_THERMAL_VECTOR, and both X86_THERMAL_VECTOR and X86_MCE_THRESHOLD don't need extra wrapping in X86_MCE conditionals. Signed-off-by: Jan Beulich Cc: Hidetoshi Seto Cc: Yong Wang Cc: Suresh Siddha Cc: Andi Kleen Cc: Borislav Petkov Cc: Arjan van de Ven LKML-Reference: <4B06AFA902000078000211F8@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hardirq.h | 6 +++--- arch/x86/kernel/irq.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 82e3e8f01043..108eb6fd1ae7 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -20,11 +20,11 @@ typedef struct { unsigned int irq_call_count; unsigned int irq_tlb_count; #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD unsigned int irq_threshold_count; -# endif #endif } ____cacheline_aligned irq_cpustat_t; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 04bbd5278568..19212cb01558 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); -# endif #endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); @@ -194,11 +194,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_call_count; sum += irq_stats(cpu)->irq_tlb_count; #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; -# endif #endif #ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); -- cgit v1.2.3-59-g8ed1b From 1261a02a0c0ab8e643125705f0d1d83e5090e4d1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 24 Nov 2009 05:27:18 -0800 Subject: perf_events, x86: Fix validate_event bug The validate_event() was failing on valid event combinations. The function was assuming that if x86_schedule_event() returned 0, it meant error. But x86_schedule_event() returns the counter index and 0 is a perfectly valid value. An error is returned if the function returns a negative value. Furthermore, validate_event() was also failing for event groups because the event->pmu was not set until after hw_perf_event_init(). Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: paulus@samba.org Cc: perfmon2-devel@lists.sourceforge.net Cc: eranian@gmail.com LKML-Reference: <4b0bdf36.1818d00a.07cc.25ae@mx.google.com> Signed-off-by: Ingo Molnar -- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bd8743024204..c1bbed1021d9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2229,10 +2229,10 @@ validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) { struct hw_perf_event fake_event = event->hw; - if (event->pmu != &pmu) + if (event->pmu && event->pmu != &pmu) return 0; - return x86_schedule_event(cpuc, &fake_event); + return x86_schedule_event(cpuc, &fake_event) >= 0; } static int validate_group(struct perf_event *event) -- cgit v1.2.3-59-g8ed1b From 28b4e0d86acf59ae3bc422921138a4958458326e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Nov 2009 22:24:44 +0900 Subject: x86: Rename global percpu symbol dr7 to cpu_dr7 Percpu symbols now occupy the same namespace as other global symbols and as such short global symbols without subsystem prefix tend to collide with local variables. dr7 percpu variable used by x86 was hit by this. Rename it to cpu_dr7. The rename also makes it more consistent with its fellow cpu_debugreg percpu variable. Signed-off-by: Tejun Heo Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Rusty Russell Cc: Christoph Lameter Cc: Linus Torvalds , Cc: Andrew Morton LKML-Reference: <20091125115856.GA17856@elte.hu> Signed-off-by: Ingo Molnar Reported-by: Stephen Rothwell --- arch/x86/include/asm/debugreg.h | 4 ++-- arch/x86/kernel/hw_breakpoint.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index fdabd8435765..8240f76b531e 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -75,7 +75,7 @@ */ #ifdef __KERNEL__ -DECLARE_PER_CPU(unsigned long, dr7); +DECLARE_PER_CPU(unsigned long, cpu_dr7); static inline void hw_breakpoint_disable(void) { @@ -91,7 +91,7 @@ static inline void hw_breakpoint_disable(void) static inline int hw_breakpoint_active(void) { - return __get_cpu_var(dr7) & DR_GLOBAL_ENABLE_MASK; + return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } extern void aout_dump_debugregs(struct user *dump); diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4d267fb77828..92ea5aad0b5c 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -46,8 +46,8 @@ #include /* Per cpu debug control register value */ -DEFINE_PER_CPU(unsigned long, dr7); -EXPORT_PER_CPU_SYMBOL(dr7); +DEFINE_PER_CPU(unsigned long, cpu_dr7); +EXPORT_PER_CPU_SYMBOL(cpu_dr7); /* Per cpu debug address registers values */ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); @@ -118,7 +118,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) set_debugreg(info->address, i); __get_cpu_var(cpu_debugreg[i]) = info->address; - dr7 = &__get_cpu_var(dr7); + dr7 = &__get_cpu_var(cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); @@ -153,7 +153,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return; - dr7 = &__get_cpu_var(dr7); + dr7 = &__get_cpu_var(cpu_dr7); *dr7 &= ~encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); @@ -437,7 +437,7 @@ void hw_breakpoint_restore(void) set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); set_debugreg(current->thread.debugreg6, 6); - set_debugreg(__get_cpu_var(dr7), 7); + set_debugreg(__get_cpu_var(cpu_dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); -- cgit v1.2.3-59-g8ed1b From 605bfaee9078cd0b01d83402315389839ee4bb5c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 26 Nov 2009 05:35:42 +0100 Subject: hw-breakpoints: Simplify error handling in breakpoint creation requests This simplifies the error handling when we create a breakpoint. We don't need to check the NULL return value corner case anymore since we have improved perf_event_create_kernel_counter() to always return an error code in the failure case. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Steven Rostedt Cc: Prasad LKML-Reference: <1259210142-5714-3-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 8 +------- kernel/hw_breakpoint.c | 4 ++-- kernel/trace/trace_ksym.c | 16 ++++------------ samples/hw_breakpoint/data_breakpoint.c | 3 --- 4 files changed, 7 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b25f8947ed7a..75e0cd847bd6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -657,10 +657,7 @@ restore: tsk, true); thread->ptrace_bps[i] = NULL; - if (!bp) { /* incorrect bp, or we have a bug in bp API */ - rc = -EINVAL; - break; - } + /* Incorrect bp, or we have a bug in bp API */ if (IS_ERR(bp)) { rc = PTR_ERR(bp); bp = NULL; @@ -729,9 +726,6 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, tsk, bp->attr.disabled); } - - if (!bp) - return -EIO; /* * CHECKME: the previous code returned -EIO if the addr wasn't a * valid task virtual addr. The new one will return -EINVAL in this diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 06d372fc026d..dd3fb4a999d3 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -442,7 +442,7 @@ register_wide_hw_breakpoint(unsigned long addr, *pevent = bp; - if (IS_ERR(bp) || !bp) { + if (IS_ERR(bp)) { err = PTR_ERR(bp); goto fail; } @@ -453,7 +453,7 @@ register_wide_hw_breakpoint(unsigned long addr, fail: for_each_possible_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); - if (IS_ERR(*pevent) || !*pevent) + if (IS_ERR(*pevent)) break; unregister_hw_breakpoint(*pevent); } diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index 9f040e42f516..c538b15b95d6 100644 --- a/kernel/trace/trace_ksym.c +++ b/kernel/trace/trace_ksym.c @@ -200,12 +200,9 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr, entry->len, entry->type, ksym_hbp_handler, true); + if (IS_ERR(entry->ksym_hbp)) { - entry->ksym_hbp = NULL; ret = PTR_ERR(entry->ksym_hbp); - } - - if (!entry->ksym_hbp) { printk(KERN_INFO "ksym_tracer request failed. Try again" " later!!\n"); goto err; @@ -332,21 +329,16 @@ static ssize_t ksym_trace_filter_write(struct file *file, if (changed) { unregister_wide_hw_breakpoint(entry->ksym_hbp); entry->type = op; + ret = 0; if (op > 0) { entry->ksym_hbp = register_wide_hw_breakpoint(entry->ksym_addr, entry->len, entry->type, ksym_hbp_handler, true); if (IS_ERR(entry->ksym_hbp)) - entry->ksym_hbp = NULL; - - /* modified without problem */ - if (entry->ksym_hbp) { - ret = 0; + ret = PTR_ERR(entry->ksym_hbp); + else goto out; - } - } else { - ret = 0; } /* Error or "symbol:---" case: drop it */ ksym_filter_entry_count--; diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index 95063818bcf4..ee7f9fbaffbd 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -61,9 +61,6 @@ static int __init hw_break_module_init(void) if (IS_ERR(sample_hbp)) { ret = PTR_ERR(sample_hbp); goto fail; - } else if (!sample_hbp) { - ret = -EINVAL; - goto fail; } printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name); -- cgit v1.2.3-59-g8ed1b From 2c31b7958fd21df9fa04e5c36cda0f063ac70b27 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 26 Nov 2009 06:04:38 +0100 Subject: x86/hw-breakpoints: Don't lose GE flag while disabling a breakpoint When we schedule out a breakpoint from the cpu, we also incidentally remove the "Global exact breakpoint" flag from the breakpoint control register. It makes us losing the fine grained precision about the origin of the instructions that may trigger breakpoint exceptions for the other breakpoints running in this cpu. Reported-by: Prasad Signed-off-by: Frederic Weisbecker LKML-Reference: <1259211878-6013-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 92ea5aad0b5c..d42f65ac4927 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -59,21 +59,27 @@ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); -/* - * Encode the length, type, Exact, and Enable bits for a particular breakpoint - * as stored in debug register 7. - */ -unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +static inline unsigned long +__encode_dr7(int drnum, unsigned int len, unsigned int type) { unsigned long bp_info; bp_info = (len | type) & 0xf; bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); - bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) | - DR_GLOBAL_SLOWDOWN; + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); + return bp_info; } +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; +} + /* * Decode the length and type bits for a particular breakpoint as * stored in debug register 7. Return the "enabled" status. @@ -154,7 +160,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) return; dr7 = &__get_cpu_var(cpu_dr7); - *dr7 &= ~encode_dr7(i, info->len, info->type); + *dr7 &= ~__encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); } -- cgit v1.2.3-59-g8ed1b From 767df1bdd8cbff2c8c40c9ac8295bbdaa5fb24c4 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 17:29:02 +0900 Subject: x86, mce: Add __cpuinit to hotplug callback functions The mce_disable_cpu() and mce_reenable_cpu() are called only from mce_cpu_callback() which is marked as __cpuinit. So these functions can be __cpuinit too. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen LKML-Reference: <4B0E3C4E.4090809@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5f277cad2ed7..0bcaa3875863 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1953,13 +1953,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) +static void __cpuinit mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { @@ -1970,7 +1971,7 @@ static void mce_disable_cpu(void *h) } } -static void mce_reenable_cpu(void *h) +static void __cpuinit mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; -- cgit v1.2.3-59-g8ed1b From 5fa10b28e57f94a90535cfeafe89dcee9f47d540 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Nov 2009 04:55:53 +0100 Subject: hw-breakpoints: Use struct perf_event_attr to define user breakpoints In-kernel user breakpoints are created using functions in which we pass breakpoint parameters as individual variables: address, length and type. Although it fits well for x86, this just does not scale across archictectures that may support this api later as these may have more or different needs. Pass in a perf_event_attr structure instead because it is meant to evolve as much as possible into a generic hardware breakpoint parameter structure. Reported-by: K.Prasad Signed-off-by: Frederic Weisbecker LKML-Reference: <1259294154-5197-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 74 ++++++++++++++++++++---------------- include/linux/hw_breakpoint.h | 36 ++++++++---------- kernel/hw_breakpoint.c | 87 +++++++++---------------------------------- 3 files changed, 75 insertions(+), 122 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 75e0cd847bd6..2941b32ea666 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -593,6 +593,34 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } +static struct perf_event * +ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + struct task_struct *tsk) +{ + int err; + int gen_len, gen_type; + DEFINE_BREAKPOINT_ATTR(attr); + + /* + * We shoud have at least an inactive breakpoint at this + * slot. It means the user is writing dr7 without having + * written the address register first + */ + if (!bp) + return ERR_PTR(-EINVAL); + + err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + if (err) + return ERR_PTR(err); + + attr = bp->attr; + attr.bp_len = gen_len; + attr.bp_type = gen_type; + attr.disabled = 0; + + return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); +} + /* * Handle ptrace writes to debug register 7. */ @@ -603,7 +631,6 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) int i, orig_ret = 0, rc = 0; int enabled, second_pass = 0; unsigned len, type; - int gen_len, gen_type; struct perf_event *bp; data &= ~DR_CONTROL_RESERVED; @@ -634,33 +661,12 @@ restore: continue; } - /* - * We shoud have at least an inactive breakpoint at this - * slot. It means the user is writing dr7 without having - * written the address register first - */ - if (!bp) { - rc = -EINVAL; - break; - } - - rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type); - if (rc) - break; - - /* - * This is a temporary thing as bp is unregistered/registered - * to simulate modification - */ - bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len, - gen_type, bp->callback, - tsk, true); - thread->ptrace_bps[i] = NULL; + bp = ptrace_modify_breakpoint(bp, len, type, tsk); /* Incorrect bp, or we have a bug in bp API */ if (IS_ERR(bp)) { rc = PTR_ERR(bp); - bp = NULL; + thread->ptrace_bps[i] = NULL; break; } thread->ptrace_bps[i] = bp; @@ -707,24 +713,26 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, { struct perf_event *bp; struct thread_struct *t = &tsk->thread; + DEFINE_BREAKPOINT_ATTR(attr); if (!t->ptrace_bps[nr]) { /* * Put stub len and type to register (reserve) an inactive but * correct bp */ - bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1, - HW_BREAKPOINT_W, - ptrace_triggered, tsk, - false); + attr.bp_addr = addr; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); } else { bp = t->ptrace_bps[nr]; t->ptrace_bps[nr] = NULL; - bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len, - bp->attr.bp_type, - bp->callback, - tsk, - bp->attr.disabled); + + attr = bp->attr; + attr.bp_addr = addr; + bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); } /* * CHECKME: the previous code returned -EIO if the addr wasn't a diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index c9f7f7c7b0e0..5da472e434b7 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -20,6 +20,14 @@ enum { #ifdef CONFIG_HAVE_HW_BREAKPOINT +/* As it's for in-kernel or ptrace use, we want it to be pinned */ +#define DEFINE_BREAKPOINT_ATTR(name) \ +struct perf_event_attr name = { \ + .type = PERF_TYPE_BREAKPOINT, \ + .size = sizeof(name), \ + .pinned = 1, \ +}; + static inline unsigned long hw_breakpoint_addr(struct perf_event *bp) { return bp->attr.bp_addr; @@ -36,22 +44,16 @@ static inline int hw_breakpoint_len(struct perf_event *bp) } extern struct perf_event * -register_user_hw_breakpoint(unsigned long addr, - int len, - int type, +register_user_hw_breakpoint(struct perf_event_attr *attr, perf_callback_t triggered, - struct task_struct *tsk, - bool active); + struct task_struct *tsk); /* FIXME: only change from the attr, and don't unregister */ extern struct perf_event * modify_user_hw_breakpoint(struct perf_event *bp, - unsigned long addr, - int len, - int type, + struct perf_event_attr *attr, perf_callback_t triggered, - struct task_struct *tsk, - bool active); + struct task_struct *tsk); /* * Kernel breakpoints are not associated with any particular thread. @@ -89,20 +91,14 @@ static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp) #else /* !CONFIG_HAVE_HW_BREAKPOINT */ static inline struct perf_event * -register_user_hw_breakpoint(unsigned long addr, - int len, - int type, +register_user_hw_breakpoint(struct perf_event_attr *attr, perf_callback_t triggered, - struct task_struct *tsk, - bool active) { return NULL; } + struct task_struct *tsk) { return NULL; } static inline struct perf_event * modify_user_hw_breakpoint(struct perf_event *bp, - unsigned long addr, - int len, - int type, + struct perf_event_attr *attr, perf_callback_t triggered, - struct task_struct *tsk, - bool active) { return NULL; } + struct task_struct *tsk) { return NULL; } static inline struct perf_event * register_wide_hw_breakpoint_cpu(unsigned long addr, int len, diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 32e1018191be..2a47514f12fd 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -289,90 +289,32 @@ int register_perf_hw_breakpoint(struct perf_event *bp) return __register_perf_hw_breakpoint(bp); } -/* - * Register a breakpoint bound to a task and a given cpu. - * If cpu is -1, the breakpoint is active for the task in every cpu - * If the task is -1, the breakpoint is active for every tasks in the given - * cpu. - */ -static struct perf_event * -register_user_hw_breakpoint_cpu(unsigned long addr, - int len, - int type, - perf_callback_t triggered, - pid_t pid, - int cpu, - bool active) -{ - struct perf_event_attr *attr; - struct perf_event *bp; - - attr = kzalloc(sizeof(*attr), GFP_KERNEL); - if (!attr) - return ERR_PTR(-ENOMEM); - - attr->type = PERF_TYPE_BREAKPOINT; - attr->size = sizeof(*attr); - attr->bp_addr = addr; - attr->bp_len = len; - attr->bp_type = type; - /* - * Such breakpoints are used by debuggers to trigger signals when - * we hit the excepted memory op. We can't miss such events, they - * must be pinned. - */ - attr->pinned = 1; - - if (!active) - attr->disabled = 1; - - bp = perf_event_create_kernel_counter(attr, cpu, pid, triggered); - kfree(attr); - - return bp; -} - /** * register_user_hw_breakpoint - register a hardware breakpoint for user space - * @addr: is the memory address that triggers the breakpoint - * @len: the length of the access to the memory (1 byte, 2 bytes etc...) - * @type: the type of the access to the memory (read/write/exec) + * @attr: breakpoint attributes * @triggered: callback to trigger when we hit the breakpoint * @tsk: pointer to 'task_struct' of the process to which the address belongs - * @active: should we activate it while registering it - * */ struct perf_event * -register_user_hw_breakpoint(unsigned long addr, - int len, - int type, +register_user_hw_breakpoint(struct perf_event_attr *attr, perf_callback_t triggered, - struct task_struct *tsk, - bool active) + struct task_struct *tsk) { - return register_user_hw_breakpoint_cpu(addr, len, type, triggered, - tsk->pid, -1, active); + return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); /** * modify_user_hw_breakpoint - modify a user-space hardware breakpoint * @bp: the breakpoint structure to modify - * @addr: is the memory address that triggers the breakpoint - * @len: the length of the access to the memory (1 byte, 2 bytes etc...) - * @type: the type of the access to the memory (read/write/exec) + * @attr: new breakpoint attributes * @triggered: callback to trigger when we hit the breakpoint * @tsk: pointer to 'task_struct' of the process to which the address belongs - * @active: should we activate it while registering it */ struct perf_event * -modify_user_hw_breakpoint(struct perf_event *bp, - unsigned long addr, - int len, - int type, +modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr, perf_callback_t triggered, - struct task_struct *tsk, - bool active) + struct task_struct *tsk) { /* * FIXME: do it without unregistering @@ -381,8 +323,7 @@ modify_user_hw_breakpoint(struct perf_event *bp, */ unregister_hw_breakpoint(bp); - return register_user_hw_breakpoint(addr, len, type, triggered, - tsk, active); + return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); @@ -406,8 +347,16 @@ register_kernel_hw_breakpoint_cpu(unsigned long addr, int cpu, bool active) { - return register_user_hw_breakpoint_cpu(addr, len, type, triggered, - -1, cpu, active); + DEFINE_BREAKPOINT_ATTR(attr); + + attr.bp_addr = addr; + attr.bp_len = len; + attr.bp_type = type; + + if (!active) + attr.disabled = 1; + + return perf_event_create_kernel_counter(&attr, cpu, -1, triggered); } /** -- cgit v1.2.3-59-g8ed1b From 1cedae72904b85462082dbcfd5190309ba37f8bd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Dec 2009 07:32:16 +0100 Subject: hw-breakpoints: Keep track of user disabled breakpoints When we disable a breakpoint through dr7, we unregister it right away, making us lose track of its corresponding address register value. It means that the following sequence would be unsupported: - set address in dr0 - enable it through dr7 - disable it through dr7 - enable it through dr7 because we lost the address register value when we disabled the breakpoint. Don't unregister the disabled breakpoints but rather disable them. Reported-by: "K.Prasad" Signed-off-by: Frederic Weisbecker LKML-Reference: <1259735536-9236-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2941b32ea666..04d182a7cfdb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -595,7 +595,7 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) static struct perf_event * ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, - struct task_struct *tsk) + struct task_struct *tsk, int disabled) { int err; int gen_len, gen_type; @@ -616,7 +616,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, attr = bp->attr; attr.bp_len = gen_len; attr.bp_type = gen_type; - attr.disabled = 0; + attr.disabled = disabled; return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); } @@ -655,13 +655,21 @@ restore: */ if (!second_pass) continue; + thread->ptrace_bps[i] = NULL; - unregister_hw_breakpoint(bp); + bp = ptrace_modify_breakpoint(bp, len, type, + tsk, 1); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + thread->ptrace_bps[i] = NULL; + break; + } + thread->ptrace_bps[i] = bp; } continue; } - bp = ptrace_modify_breakpoint(bp, len, type, tsk); + bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); /* Incorrect bp, or we have a bug in bp API */ if (IS_ERR(bp)) { -- cgit v1.2.3-59-g8ed1b