From c00b275043adc14d668f36266b890f0c53d46640 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:27:44 +0200 Subject: uprobes: Optimize is_swbp_at_addr() for current->mm Change is_swbp_at_addr() to try to avoid the costly read_opcode() if mm == current->mm, __copy_from_user_inatomic() should succeed in the likely case. Currently this optimization is not important, but we are going to add more is_swbp_at_addr(current->mm) callers. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192744.GA8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 985be4d80fe8..d0f5ec0dcdea 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -333,10 +333,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) uprobe_opcode_t opcode; int result; + if (current->mm == mm) { + pagefault_disable(); + result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, + sizeof(opcode)); + pagefault_enable(); + + if (likely(result == 0)) + goto out; + } + result = read_opcode(mm, vaddr, &opcode); if (result) return result; - +out: if (is_swbp_insn(&opcode)) return 1; -- cgit v1.2.3-59-g8ed1b From a3d7bb47937b3a40b9f0c75655e97b3bb6407cbe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:27:59 +0200 Subject: uprobes: Change read_opcode() to use FOLL_FORCE set_orig_insn()->read_opcode() should not fail if the probed task did mprotect() after uprobe_register(), change it to use FOLL_FORCE. Without FOLL_WRITE this doesn't have any "side" effect but allows to read the !VM_READ memory. There is another reason for this change, we are going to use is_swbp_at_addr() from handle_swbp() which can race with another thread doing mprotect(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192759.GB8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d0f5ec0dcdea..a0dbc87a2ec6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -312,7 +312,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ void *vaddr_new; int ret; - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); + ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (ret <= 0) return ret; -- cgit v1.2.3-59-g8ed1b From 3a9ea0520f38def4a3915b91f82455b749f07d88 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:28:57 +0200 Subject: uprobes: Introduce find_active_uprobe() helper No functional changes. Move the "find uprobe" code from handle_swbp() to the new helper, find_active_uprobe(). Note: with or without this change, the find-active-uprobe logic is not exactly right. We can race with another thread which unmaps the memory with the valid uprobe before we take mm->mmap_sem. We can't find this uprobe simply because find_vma() fails. In this case we wrongly assume that this trap was not caused by uprobe and send the erroneous SIGTRAP. See the next changes. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192857.GC8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a0dbc87a2ec6..eaf4d55fd424 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1489,38 +1489,47 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } -/* - * Run handler and ask thread to singlestep. - * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. - */ -static void handle_swbp(struct pt_regs *regs) +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) { + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; struct vm_area_struct *vma; - struct uprobe_task *utask; - struct uprobe *uprobe; - struct mm_struct *mm; - unsigned long bp_vaddr; - uprobe = NULL; - bp_vaddr = uprobe_get_swbp_addr(regs); - mm = current->mm; down_read(&mm->mmap_sem); vma = find_vma(mm, bp_vaddr); - if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) { - struct inode *inode; - loff_t offset; + if (vma && vma->vm_start <= bp_vaddr) { + if (valid_vma(vma, false)) { + struct inode *inode; + loff_t offset; - inode = vma->vm_file->f_mapping->host; - offset = bp_vaddr - vma->vm_start; - offset += (vma->vm_pgoff << PAGE_SHIFT); - uprobe = find_uprobe(inode, offset); + inode = vma->vm_file->f_mapping->host; + offset = bp_vaddr - vma->vm_start; + offset += (vma->vm_pgoff << PAGE_SHIFT); + uprobe = find_uprobe(inode, offset); + } } srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); current->uprobe_srcu_id = -1; up_read(&mm->mmap_sem); + return uprobe; +} + +/* + * Run handler and ask thread to singlestep. + * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. + */ +static void handle_swbp(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct uprobe *uprobe; + unsigned long bp_vaddr; + + bp_vaddr = uprobe_get_swbp_addr(regs); + uprobe = find_active_uprobe(bp_vaddr); + if (!uprobe) { /* No matching uprobe; signal SIGTRAP. */ send_sig(SIGTRAP, current, 0); -- cgit v1.2.3-59-g8ed1b From d790d34653ab20c74034902f5f0889bba807949a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:14 +0200 Subject: uprobes: Teach find_active_uprobe() to provide the "is_swbp" info A separate patch to simplify the review, and for the documentation. The patch adds another "int *is_swbp" argument to find_active_uprobe(), so far its only caller doesn't use this info. With this patch find_active_uprobe() additionally does: - if find_vma() + ->vm_start check fails, *is_swbp = -EFAULT - otherwise, if valid_vma() + find_uprobe() fails, it holds the result of is_swbp_at_addr(), can be negative too. The latter is only possible if we raced with another thread which did munmap/etc after we hit this bp. IOW. If find_active_uprobe(&is_swbp) returns NULL, the caller can look at is_swbp to figure out whether the current insn is bp or not, or detect the race with another thread if it is negative. Note: I think that performance-wise this change is fine. This adds is_swbp_at_addr(), but only if we raced with uprobe_unregister() or if we hit the "normal" int3 but this mm has uprobes as well. And even in this case the slow read_opcode() path is very unlikely, this insn recently triggered do_int3(), __copy_from_user_inatomic() shouldn't fail in the likely case. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192914.GD8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eaf4d55fd424..ee3df704e78a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1489,7 +1489,7 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) +static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; @@ -1497,7 +1497,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) down_read(&mm->mmap_sem); vma = find_vma(mm, bp_vaddr); - if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { struct inode *inode; @@ -1508,6 +1507,11 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr) offset += (vma->vm_pgoff << PAGE_SHIFT); uprobe = find_uprobe(inode, offset); } + + if (!uprobe) + *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + } else { + *is_swbp = -EFAULT; } srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); @@ -1526,9 +1530,10 @@ static void handle_swbp(struct pt_regs *regs) struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); - uprobe = find_active_uprobe(bp_vaddr); + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { /* No matching uprobe; signal SIGTRAP. */ -- cgit v1.2.3-59-g8ed1b From 77fc4af1b59d12ab3b1467adf0a5204806853123 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:28 +0200 Subject: uprobes: Change register_for_each_vma() to take mm->mmap_sem for writing Change register_for_each_vma() to take mm->mmap_sem for writing. This is a bit unfortunate but hopefully not too bad, this is the slow path anyway. This is needed to ensure that find_active_uprobe() can not race with uprobe_register() which adds the new bp at the same bp_vaddr, after find_uprobe() fails and before is_swbp_at_addr_fast() checks the memory. IOW, this is needed to ensure that if find_active_uprobe() returns NULL but is_swbp == true, we can safely assume that it was the "normal" int3 and we should send SIGTRAP. There is another reason for this change. We are going to replace uprobes_state->count with MMF_ flags set by register/unregister and cleared by find_active_uprobe(), and set/clear shouldn't race with each other. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192928.GE8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ee3df704e78a..a2ed82b4808c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -853,12 +853,12 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) } mm = vi->mm; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, (unsigned long)vi->vaddr); if (!vma || !valid_vma(vma, is_register)) { list_del(&vi->probe_list); kfree(vi); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); continue; } @@ -867,7 +867,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr != vi->vaddr) { list_del(&vi->probe_list); kfree(vi); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); continue; } @@ -877,7 +877,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) else remove_breakpoint(uprobe, mm, vi->vaddr); - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); mmput(mm); if (is_register) { if (ret && ret == -EEXIST) -- cgit v1.2.3-59-g8ed1b From 56bb4cf6475d702d2fb00fc641aa6441097c0330 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:29:47 +0200 Subject: uprobes: Teach handle_swbp() to rely on "is_swbp" rather than uprobes_srcu Currently handle_swbp() assumes that it can't race with unregister, so it roughly does: if (find_uprobe(vaddr)) process_uprobe(); else send_sig(SIGTRAP); This relies on the not-really-working uprobes_srcu code we are going to remove, see the next patch. With this patch we rely on the result of is_swbp_at_addr(bp_vaddr) if find_uprobe() fails. If is_swbp == 1, then we hit the normal int3, we should send SIGTRAP. If is_swbp == 0, we raced with uprobe_unregister(), we simply restart this insn again. The "difficult" case is is_swbp == -EFAULT, when we can't read this memory. In this case I think we should restart too, and this is more correct compared to the current code which sends SIGTRAP. Ignoring ENOMEM/etc from get_user_pages(), this can only happen if another thread unmaps this memory before find_active_uprobe() takes mmap_sem. It would be better to pretend it was unmapped before this insn was executed, restart, and get SIGSEGV. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529192947.GF8057@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a2ed82b4808c..1f02e3bbfc1d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1530,14 +1530,26 @@ static void handle_swbp(struct pt_regs *regs) struct uprobe_task *utask; struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp; + int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { - /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + if (is_swbp > 0) { + /* No matching uprobe; signal SIGTRAP. */ + send_sig(SIGTRAP, current, 0); + } else { + /* + * Either we raced with uprobe_unregister() or we can't + * access this memory. The latter is only possible if + * another thread plays with our ->mm. In both cases + * we can simply restart. If this vma was unmapped we + * can pretend this insn was not executed yet and get + * the (correct) SIGSEGV after restart. + */ + instruction_pointer_set(regs, bp_vaddr); + } return; } -- cgit v1.2.3-59-g8ed1b From 778b032d96909690c19d84f8d17c13be65ed6f8e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 May 2012 21:30:08 +0200 Subject: uprobes: Kill uprobes_srcu/uprobe_srcu_id Kill the no longer needed uprobes_srcu/uprobe_srcu_id code. It doesn't really work anyway. synchronize_srcu() can only synchronize with the code "inside" the srcu_read_lock/srcu_read_unlock section, while uprobe_pre_sstep_notifier() does srcu_read_lock() _after_ we already hit the breakpoint. I guess this probably works "in practice". synchronize_srcu() is slow and it implies synchronize_sched(), and the probed task enters the non- preemptible section at the start of exception handler. Still this is not right at least in theory, and task->uprobe_srcu_id blows task_struct. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120529193008.GG8057@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/events/uprobes.c | 22 +++------------------- 2 files changed, 3 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c54476..6bd19655c1a7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1569,7 +1569,6 @@ struct task_struct { #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; - int uprobe_srcu_id; #endif }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1f02e3bbfc1d..8c5e043cd309 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -38,7 +38,6 @@ #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE -static struct srcu_struct uprobes_srcu; static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ @@ -738,20 +737,14 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) } /* - * There could be threads that have hit the breakpoint and are entering the - * notifier code and trying to acquire the uprobes_treelock. The thread - * calling delete_uprobe() that is removing the uprobe from the rb_tree can - * race with these threads and might acquire the uprobes_treelock compared - * to some of the breakpoint hit threads. In such a case, the breakpoint - * hit threads will not find the uprobe. The current unregistering thread - * waits till all other threads have hit a breakpoint, to acquire the - * uprobes_treelock before the uprobe is removed from the rbtree. + * There could be threads that have already hit the breakpoint. They + * will recheck the current insn and restart if find_uprobe() fails. + * See find_active_uprobe(). */ static void delete_uprobe(struct uprobe *uprobe) { unsigned long flags; - synchronize_srcu(&uprobes_srcu); spin_lock_irqsave(&uprobes_treelock, flags); rb_erase(&uprobe->rb_node, &uprobes_tree); spin_unlock_irqrestore(&uprobes_treelock, flags); @@ -1388,9 +1381,6 @@ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - if (t->uprobe_srcu_id != -1) - srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id); - if (!utask) return; @@ -1408,7 +1398,6 @@ void uprobe_free_utask(struct task_struct *t) void uprobe_copy_process(struct task_struct *t) { t->utask = NULL; - t->uprobe_srcu_id = -1; } /* @@ -1513,9 +1502,6 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } else { *is_swbp = -EFAULT; } - - srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id); - current->uprobe_srcu_id = -1; up_read(&mm->mmap_sem); return uprobe; @@ -1656,7 +1642,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) utask->state = UTASK_BP_HIT; set_thread_flag(TIF_UPROBE); - current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu); return 1; } @@ -1691,7 +1676,6 @@ static int __init init_uprobes(void) mutex_init(&uprobes_mutex[i]); mutex_init(&uprobes_mmap_mutex[i]); } - init_srcu_struct(&uprobes_srcu); return register_die_notifier(&uprobe_exception_nb); } -- cgit v1.2.3-59-g8ed1b From 7eb9ba5ed312ec6ed9d22259c5da1acb7cf4bd29 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 8 Jun 2012 15:02:57 +0530 Subject: uprobes: Pass probed vaddr to arch_uprobe_analyze_insn() On RISC architectures like powerpc, instructions are fixed size. Instruction analysis on such platforms is just a matter of (insn % 4). Pass the vaddr at which the uprobe is to be inserted so that arch_uprobe_analyze_insn() can flag misaligned registration requests. Signed-off-by: Ananth N Mavinakaynahalli Cc: michael@ellerman.id.au Cc: antonb@thinktux.localdomain Cc: Paul Mackerras Cc: benh@kernel.crashing.org Cc: peterz@infradead.org Cc: Srikar Dronamraju Cc: Jim Keniston Cc: oleg@redhat.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20120608093257.GG13409@in.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uprobes.h | 2 +- arch/x86/kernel/uprobes.c | 3 ++- kernel/events/uprobes.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1e9bed14f7ae..f3971bbcd1de 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -48,7 +48,7 @@ struct arch_uprobe_task { #endif }; -extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); +extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index dc4e910a7d96..36fd42091fa7 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { int ret; struct insn insn; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8c5e043cd309..b52376d02332 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -706,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -EEXIST; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); if (ret) return ret; -- cgit v1.2.3-59-g8ed1b From 8d240dd88cca33b704adf3fe281aa64b5aac2dd8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 29 Mar 2012 19:11:40 +0200 Subject: ftrace: Remove a superfluous check register_ftrace_function() checks ftrace_disabled and calls __register_ftrace_function which does it again. Drop the first check and add the unlikely hint to the second one. Also, drop the label as John correctly notices. No functional change. Link: http://lkml.kernel.org/r/20120329171140.GE6409@aftab Cc: Borislav Petkov Cc: John Kacur Signed-off-by: Borislav Petkov Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a008663d86c8..b4f20fba09fc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (ftrace_disabled) + if (unlikely(ftrace_disabled)) return -ENODEV; if (FTRACE_WARN_ON(ops == &global_ops)) @@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); - if (unlikely(ftrace_disabled)) - goto out_unlock; - ret = __register_ftrace_function(ops); if (!ret) ret = ftrace_startup(ops, 0); - - out_unlock: mutex_unlock(&ftrace_lock); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_function); -- cgit v1.2.3-59-g8ed1b From 7374e82771c6d5a9af2080be46f64a5826c7efb1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 31 May 2012 21:40:05 -0400 Subject: tracing: Register the ftrace internal events during early boot All trace events including ftrace internel events (like trace_printk and function tracing), register functions that describe how to print their output. The events may be recorded as soon as the ring buffer is allocated, but they are just raw binary in the buffer. The mapping of event ids to how to print them are held within a structure that is registered on system boot. If a crash happens in boot up before these functions are registered then their output (via ftrace_dump_on_oops) will be useless: Dumping ftrace buffer: --------------------------------- <...>-1 0.... 319705us : Unknown type 6 --------------------------------- This can be quite frustrating for a kernel developer trying to see what is going wrong. There's no reason to register them so late in the boot up process. They can be registered by early_initcall(). Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index df611a0e76c5..123b189c732c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1325,4 +1325,4 @@ __init static int init_events(void) return 0; } -device_initcall(init_events); +early_initcall(init_events); -- cgit v1.2.3-59-g8ed1b From ea131377148cdfe90641b42ae9aa5a6b3a4fa327 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:22 +0200 Subject: uprobes: Valid_vma() should reject VM_HUGETLB __replace_page() obviously can't work with the hugetlbfs mappings, uprobe_register() will likely crash the kernel. Change valid_vma() to check VM_HUGETLB as well. As for PageTransHuge() no need to worry, vma->vm_file != NULL. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154322.GA9561@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b52376d02332..f0d04530af63 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -99,7 +99,8 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) if (!is_register) return true; - if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) + if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) + == (VM_READ|VM_EXEC)) return true; return false; -- cgit v1.2.3-59-g8ed1b From cc359d180fa9c25a4c1819f17e07a422d788353d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:25 +0200 Subject: uprobes: __copy_insn() should ensure a_ops->readpage != NULL __copy_insn() blindly calls read_mapping_page(), this will crash the kernel if ->readpage == NULL, add the necessary check. For example, hugetlbfs_aops->readpage is NULL. Perhaps we should change read_mapping_page() instead. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154325.GA9568@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f0d04530af63..604930bf9c92 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -610,6 +610,9 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins if (!filp) return -EINVAL; + if (!mapping->a_ops->readpage) + return -EIO; + idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); off1 = offset &= ~PAGE_MASK; -- cgit v1.2.3-59-g8ed1b From 5323ce71e4b4e1f188ebbc0cc7776885ea6c75fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:28 +0200 Subject: uprobes: Write_opcode()->__replace_page() can race with try_to_unmap() write_opcode() gets old_page via get_user_pages() and then calls __replace_page() which assumes that this old_page is still mapped after pte_offset_map_lock(). This is not true if this old_page was already try_to_unmap()'ed, and in this case everything __replace_page() does with old_page is wrong. Just for example, put_page() is not balanced. I think it is possible to teach __replace_page() to handle this unlikely case correctly, but this patch simply changes it to use page_check_address() and return -EAGAIN if it fails. The caller should notice this error code and retry. Note: write_opcode() asks for the cleanups, I'll try to do this in a separate patch. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154328.GA9571@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 604930bf9c92..3ccdb29ee8d6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -129,33 +129,17 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; unsigned long addr; - int err = -EFAULT; + spinlock_t *ptl; + pte_t *ptep; addr = page_address_in_vma(page, vma); if (addr == -EFAULT) - goto out; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; + return -EFAULT; - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; - - ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - goto out; + return -EAGAIN; get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); @@ -174,10 +158,8 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); - err = 0; -out: - return err; + return 0; } /** @@ -222,9 +204,10 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; struct uprobe *uprobe; + unsigned long pgoff; loff_t addr; int ret; - +retry: /* Read the page with vaddr into memory */ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); if (ret <= 0) @@ -269,9 +252,9 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, memcpy(vaddr_new, vaddr_old, PAGE_SIZE); /* poke the new insn in, ASSUMES we don't cross page boundary */ - vaddr &= ~PAGE_MASK; - BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); + pgoff = (vaddr & ~PAGE_MASK); + BUG_ON(pgoff + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + memcpy(vaddr_new + pgoff, &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -291,6 +274,8 @@ unlock_out: put_out: put_page(old_page); + if (unlikely(ret == -EAGAIN)) + goto retry; return ret; } -- cgit v1.2.3-59-g8ed1b From c1914a0936f79ed0236f670122e06e36e4d332ee Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:31 +0200 Subject: uprobes: Install_breakpoint() should fail if is_swbp_insn() == T install_breakpoint() returns -EEXIST if is_swbp_insn(orig_insn) == T, the caller treats this code as success. This is doubly wrong. The successful return should set UPROBE_COPY_INSN, but the real problem is that it shouldn't succeed. If the probed insn is int3 the application should get SIGTRAP, this won't happen with uprobe. Probably we can fix this, we can add the UPROBE_SHARED_BP flag and teach handle_swbp/set_orig_insn to handle this case correctly. But this needs some complications and we have other insns which can't be probed, lets make a simple fix for now. I think this needs a cleanup. UPROBE_COPY_INSN should die, copy_insn() should be called by alloc_uprobe(). arch_uprobe_analyze_insn() depends on ->mm (ia32_compat) but it is called only once. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154331.GA9578@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3ccdb29ee8d6..ec78152e32e9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -693,7 +693,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, return ret; if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) - return -EEXIST; + return -ENOTSUPP; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); if (ret) -- cgit v1.2.3-59-g8ed1b From 268720903f87e0b84b161626c4447b81671b5d18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:33 +0200 Subject: uprobes: Rework register_for_each_vma() to make it O(n) Currently register_for_each_vma() is O(n ** 2) + O(n ** 3), every time find_next_vma_info() "restarts" the vma_prio_tree_foreach() loop and each iteration rechecks the whole try_list. This also means that try_list can grow "indefinitely" if register/unregister races with munmap/mmap activity even if the number of mapping is bounded at any time. With this patch register_for_each_vma() builds the list of mm/vaddr structures only once and does install_breakpoint() for each entry. We do not care about the new mappings which can be created after build_map_info() drops mapping->i_mmap_mutex, uprobe_mmap() should do its work. Note that we do not allocate map_info under i_mmap_mutex, this can deadlock with page reclaim (but see the next patch). So we use 2 lists, "curr" which we are going to return, and "prev" which holds the already allocated memory. The main loop deques the entry from "prev" (initially it is empty), and if "prev" becomes empty again it counts the number of entries we need to pre-allocate outside of i_mmap_mutex. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Link: http://lkml.kernel.org/r/20120615154333.GA9581@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 199 +++++++++++++++++++++--------------------------- 1 file changed, 86 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ec78152e32e9..4e0db3496d70 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -60,17 +60,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; */ static atomic_t uprobe_events = ATOMIC_INIT(0); -/* - * Maintain a temporary per vma info that can be used to search if a vma - * has already been handled. This structure is introduced since extending - * vm_area_struct wasnt recommended. - */ -struct vma_info { - struct list_head probe_list; - struct mm_struct *mm; - loff_t vaddr; -}; - struct uprobe { struct rb_node rb_node; /* node in the rb tree */ atomic_t ref; @@ -742,139 +731,123 @@ static void delete_uprobe(struct uprobe *uprobe) atomic_dec(&uprobe_events); } -static struct vma_info * -__find_next_vma_info(struct address_space *mapping, struct list_head *head, - struct vma_info *vi, loff_t offset, bool is_register) +struct map_info { + struct map_info *next; + struct mm_struct *mm; + loff_t vaddr; +}; + +static inline struct map_info *free_map_info(struct map_info *info) { + struct map_info *next = info->next; + kfree(info); + return next; +} + +static struct map_info * +build_map_info(struct address_space *mapping, loff_t offset, bool is_register) +{ + unsigned long pgoff = offset >> PAGE_SHIFT; struct prio_tree_iter iter; struct vm_area_struct *vma; - struct vma_info *tmpvi; - unsigned long pgoff; - int existing_vma; - loff_t vaddr; - - pgoff = offset >> PAGE_SHIFT; + struct map_info *curr = NULL; + struct map_info *prev = NULL; + struct map_info *info; + int more = 0; + again: + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; - existing_vma = 0; - vaddr = vma_address(vma, offset); - - list_for_each_entry(tmpvi, head, probe_list) { - if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { - existing_vma = 1; - break; - } - } - - /* - * Another vma needs a probe to be installed. However skip - * installing the probe if the vma is about to be unlinked. - */ - if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) { - vi->mm = vma->vm_mm; - vi->vaddr = vaddr; - list_add(&vi->probe_list, head); - - return vi; + if (!prev) { + more++; + continue; } - } - - return NULL; -} -/* - * Iterate in the rmap prio tree and find a vma where a probe has not - * yet been inserted. - */ -static struct vma_info * -find_next_vma_info(struct address_space *mapping, struct list_head *head, - loff_t offset, bool is_register) -{ - struct vma_info *vi, *retvi; + if (!atomic_inc_not_zero(&vma->vm_mm->mm_users)) + continue; - vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); - if (!vi) - return ERR_PTR(-ENOMEM); + info = prev; + prev = prev->next; + info->next = curr; + curr = info; - mutex_lock(&mapping->i_mmap_mutex); - retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); + info->mm = vma->vm_mm; + info->vaddr = vma_address(vma, offset); + } mutex_unlock(&mapping->i_mmap_mutex); - if (!retvi) - kfree(vi); + if (!more) + goto out; + + prev = curr; + while (curr) { + mmput(curr->mm); + curr = curr->next; + } - return retvi; + do { + info = kmalloc(sizeof(struct map_info), GFP_KERNEL); + if (!info) { + curr = ERR_PTR(-ENOMEM); + goto out; + } + info->next = prev; + prev = info; + } while (--more); + + goto again; + out: + while (prev) + prev = free_map_info(prev); + return curr; } static int register_for_each_vma(struct uprobe *uprobe, bool is_register) { - struct list_head try_list; - struct vm_area_struct *vma; - struct address_space *mapping; - struct vma_info *vi, *tmpvi; - struct mm_struct *mm; - loff_t vaddr; - int ret; + struct map_info *info; + int err = 0; - mapping = uprobe->inode->i_mapping; - INIT_LIST_HEAD(&try_list); - - ret = 0; + info = build_map_info(uprobe->inode->i_mapping, + uprobe->offset, is_register); + if (IS_ERR(info)) + return PTR_ERR(info); - for (;;) { - vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); - if (!vi) - break; + while (info) { + struct mm_struct *mm = info->mm; + struct vm_area_struct *vma; + loff_t vaddr; - if (IS_ERR(vi)) { - ret = PTR_ERR(vi); - break; - } + if (err) + goto free; - mm = vi->mm; down_write(&mm->mmap_sem); - vma = find_vma(mm, (unsigned long)vi->vaddr); - if (!vma || !valid_vma(vma, is_register)) { - list_del(&vi->probe_list); - kfree(vi); - up_write(&mm->mmap_sem); - mmput(mm); - continue; - } + vma = find_vma(mm, (unsigned long)info->vaddr); + if (!vma || !valid_vma(vma, is_register)) + goto unlock; + vaddr = vma_address(vma, uprobe->offset); if (vma->vm_file->f_mapping->host != uprobe->inode || - vaddr != vi->vaddr) { - list_del(&vi->probe_list); - kfree(vi); - up_write(&mm->mmap_sem); - mmput(mm); - continue; - } + vaddr != info->vaddr) + goto unlock; - if (is_register) - ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); - else - remove_breakpoint(uprobe, mm, vi->vaddr); - - up_write(&mm->mmap_sem); - mmput(mm); if (is_register) { - if (ret && ret == -EEXIST) - ret = 0; - if (ret) - break; + err = install_breakpoint(uprobe, mm, vma, info->vaddr); + if (err == -EEXIST) + err = 0; + } else { + remove_breakpoint(uprobe, mm, info->vaddr); } + unlock: + up_write(&mm->mmap_sem); + free: + mmput(mm); + info = free_map_info(info); } - list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { - list_del(&vi->probe_list); - kfree(vi); - } - - return ret; + return err; } static int __uprobe_register(struct uprobe *uprobe) -- cgit v1.2.3-59-g8ed1b From 7a5bfb66b07f22d2429db776da7bb8b57bfb5cff Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:36 +0200 Subject: uprobes: Change build_map_info() to try kmalloc(GFP_NOWAIT) first build_map_info() doesn't allocate the memory under i_mmap_mutex to avoid the deadlock with page reclaim. But it can try GFP_NOWAIT first, it should work in the likely case and thus we almost never need the pre-alloc-and-retry path. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Link: http://lkml.kernel.org/r/20120615154336.GA9588@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4e0db3496d70..897417dbca8e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -761,6 +761,16 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) if (!valid_vma(vma, is_register)) continue; + if (!prev && !more) { + /* + * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * reclaim. This is optimistic, no harm done if it fails. + */ + prev = kmalloc(sizeof(struct map_info), + GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (prev) + prev->next = NULL; + } if (!prev) { more++; continue; -- cgit v1.2.3-59-g8ed1b From c5784de2b351fe871bb57487878f7fc7ec5b075c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jun 2012 17:43:39 +0200 Subject: uprobes: Document uprobe_register() vs uprobe_mmap() race Because the mind is treacherous and makes us forget we need to write stuff down. Signed-off-by: Peter Zijlstra Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154339.GA9591@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 897417dbca8e..2671d9ad49be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -44,6 +44,23 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 +/* + * We need separate register/unregister and mmap/munmap lock hashes because + * of mmap_sem nesting. + * + * uprobe_register() needs to install probes on (potentially) all processes + * and thus needs to acquire multiple mmap_sems (consequtively, not + * concurrently), whereas uprobe_mmap() is called while holding mmap_sem + * for the particular process doing the mmap. + * + * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem + * because of lock order against i_mmap_mutex. This means there's a hole in + * the register vma iteration where a mmap() can happen. + * + * Thus uprobe_register() can race with uprobe_mmap() and we can try and + * install a probe where one is already installed. + */ + /* serialize (un)register */ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; @@ -339,7 +356,9 @@ out: int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { int result; - + /* + * See the comment near uprobes_hash(). + */ result = is_swbp_at_addr(mm, vaddr); if (result == 1) return -EEXIST; @@ -845,6 +864,10 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (is_register) { err = install_breakpoint(uprobe, mm, vma, info->vaddr); + /* + * We can race against uprobe_mmap(), see the + * comment near uprobe_hash(). + */ if (err == -EEXIST) err = 0; } else { @@ -1054,8 +1077,10 @@ int uprobe_mmap(struct vm_area_struct *vma) } ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); - - /* Ignore double add: */ + /* + * We can race against uprobe_register(), see the + * comment near uprobe_hash(). + */ if (ret == -EEXIST) { ret = 0; -- cgit v1.2.3-59-g8ed1b From d436615e60c386095dac4a9bf72b08868d2a7564 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:42 +0200 Subject: uprobes: Copy_insn() shouldn't depend on mm/vma/vaddr 1. copy_insn() doesn't need "addr", it can use uprobe->offset. Remove this argument. 2. Change copy_insn/__copy_insn to accept "struct file*" instead of vma. copy_insn() is called only once and mm/vma/vaddr are random, it shouldn't depend on them. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154342.GA9598@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2671d9ad49be..08ef566da763 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -591,10 +591,9 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) } static int -__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, +__copy_insn(struct address_space *mapping, struct file *filp, char *insn, unsigned long nbytes, unsigned long offset) { - struct file *filp = vma->vm_file; struct page *page; void *vaddr; unsigned long off1; @@ -625,15 +624,13 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins return 0; } -static int -copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) +static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping; unsigned long nbytes; int bytes; - addr &= ~PAGE_MASK; - nbytes = PAGE_SIZE - addr; + nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); mapping = uprobe->inode->i_mapping; /* Instruction at end of binary; copy only available bytes */ @@ -644,13 +641,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr) /* Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, + if (__copy_insn(mapping, filp, uprobe->arch.insn + nbytes, bytes - nbytes, uprobe->offset + nbytes)) return -ENOMEM; bytes = nbytes; } - return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); + return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); } /* @@ -696,7 +693,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, addr = (unsigned long)vaddr; if (!(uprobe->flags & UPROBE_COPY_INSN)) { - ret = copy_insn(uprobe, vma, addr); + ret = copy_insn(uprobe, vma->vm_file); if (ret) return ret; -- cgit v1.2.3-59-g8ed1b From fc36f59565861af2e897225bc3782479a26c5d5a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:44 +0200 Subject: uprobes: Copy_insn() should not return -ENOMEM if __copy_insn() fails copy_insn() returns -ENOMEM if the first __copy_insn() fails, it should return the correct error code. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154344.GA9601@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 08ef566da763..2db1d94d7dfc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -641,10 +641,10 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) /* Instruction at the page-boundary; copy bytes in second page */ if (nbytes < bytes) { - if (__copy_insn(mapping, filp, uprobe->arch.insn + nbytes, - bytes - nbytes, uprobe->offset + nbytes)) - return -ENOMEM; - + int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, + bytes - nbytes, uprobe->offset + nbytes); + if (err) + return err; bytes = nbytes; } return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); -- cgit v1.2.3-59-g8ed1b From eb2bf57bee42c7565032f93adaa211e2c9fcc52c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:47 +0200 Subject: uprobes: No need to re-check vma_address() in write_opcode() write_opcode() is called by register_for_each_vma() and uprobe_mmap() paths. In both cases the caller has already verified this vaddr under mmap_sem, no need to re-check. Note also that this check is wrong anyway, we should not truncate loff_t returned by vma_address() if we do not trust this mapping. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154347.GA9604@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2db1d94d7dfc..14c71a2aadad 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -211,7 +211,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct vm_area_struct *vma; struct uprobe *uprobe; unsigned long pgoff; - loff_t addr; int ret; retry: /* Read the page with vaddr into memory */ @@ -235,10 +234,6 @@ retry: if (mapping != vma->vm_file->f_mapping) goto put_out; - addr = vma_address(vma, uprobe->offset); - if (vaddr != (unsigned long)addr) - goto put_out; - ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) -- cgit v1.2.3-59-g8ed1b From d9c4a30e82614d43b55893a73f31e7284007ce82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:50 +0200 Subject: uprobes: Move BUG_ON(UPROBE_SWBP_INSN_SIZE) from write_opcode() to install_breakpoint() write_opcode() ensures that UPROBE_SWBP_INSN doesn't cross the page boundary. This looks a bit confusing, the check does not depend on vaddr and it is enough to do it only once right after install_breakpoint()->arch_uprobe_analyze_insn(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154350.GA9611@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 14c71a2aadad..b9c61bda9029 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -210,7 +210,6 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; struct uprobe *uprobe; - unsigned long pgoff; int ret; retry: /* Read the page with vaddr into memory */ @@ -251,11 +250,7 @@ retry: vaddr_new = kmap_atomic(new_page); memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - - /* poke the new insn in, ASSUMES we don't cross page boundary */ - pgoff = (vaddr & ~PAGE_MASK); - BUG_ON(pgoff + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - memcpy(vaddr_new + pgoff, &opcode, UPROBE_SWBP_INSN_SIZE); + memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); kunmap_atomic(vaddr_old); @@ -699,6 +694,10 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (ret) return ret; + /* write_opcode() assumes we don't cross page boundary */ + BUG_ON((uprobe->offset & ~PAGE_MASK) + + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + uprobe->flags |= UPROBE_COPY_INSN; } -- cgit v1.2.3-59-g8ed1b From 449d0d7c9fb87277175db34c009c70cb348004a8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:53 +0200 Subject: uprobes: Simplify the usage of uprobe->pending_list uprobe->pending_list is only used to create the temporary list, it has no meaning after we drop uprobes_mmap_hash(inode). No need to initialize this node or remove it from tmp_list, and we can use list_for_each_entry(). Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154353.GA9614@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b9c61bda9029..7d5c78f063ae 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -513,7 +513,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->inode = igrab(inode); uprobe->offset = offset; init_rwsem(&uprobe->consumer_rwsem); - INIT_LIST_HEAD(&uprobe->pending_list); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); @@ -1037,7 +1036,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head) int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; - struct uprobe *uprobe, *u; + struct uprobe *uprobe; struct inode *inode; int ret, count; @@ -1055,10 +1054,9 @@ int uprobe_mmap(struct vm_area_struct *vma) ret = 0; count = 0; - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + list_for_each_entry(uprobe, &tmp_list, pending_list) { loff_t vaddr; - list_del(&uprobe->pending_list); if (!ret) { vaddr = vma_address(vma, uprobe->offset); @@ -1106,7 +1104,7 @@ int uprobe_mmap(struct vm_area_struct *vma) void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct list_head tmp_list; - struct uprobe *uprobe, *u; + struct uprobe *uprobe; struct inode *inode; if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) @@ -1123,12 +1121,10 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, &tmp_list); - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { + list_for_each_entry(uprobe, &tmp_list, pending_list) { loff_t vaddr; - list_del(&uprobe->pending_list); vaddr = vma_address(vma, uprobe->offset); - if (vaddr >= start && vaddr < end) { /* * An unregister could have removed the probe before -- cgit v1.2.3-59-g8ed1b From 816c03fbabe64fa09f66fbb64e932081af381415 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:55 +0200 Subject: uprobes: Don't use loff_t for the valid virtual address loff_t looks confusing when it is used for the virtual address. Change map_info and install_breakpoint/remove_breakpoint paths to use "unsigned long". The patch doesn't change vma_address(), it can't return "long" because it is used to verify the mapping. But probably this needs some cleanups too. Signed-off-by: Oleg Nesterov Signed-off-by: Anton Arapov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154355.GA9622@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7d5c78f063ae..4df84b76dd48 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -664,9 +664,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) */ static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, - struct vm_area_struct *vma, loff_t vaddr) + struct vm_area_struct *vma, unsigned long vaddr) { - unsigned long addr; int ret; /* @@ -679,8 +678,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (!uprobe->consumers) return -EEXIST; - addr = (unsigned long)vaddr; - if (!(uprobe->flags & UPROBE_COPY_INSN)) { ret = copy_insn(uprobe, vma->vm_file); if (ret) @@ -689,7 +686,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) return -ENOTSUPP; - ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, addr); + ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) return ret; @@ -709,7 +706,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * Hence increment before and decrement on failure. */ atomic_inc(&mm->uprobes_state.count); - ret = set_swbp(&uprobe->arch, mm, addr); + ret = set_swbp(&uprobe->arch, mm, vaddr); if (ret) atomic_dec(&mm->uprobes_state.count); @@ -717,9 +714,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, } static void -remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) +remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) + if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) atomic_dec(&mm->uprobes_state.count); } @@ -743,7 +740,7 @@ static void delete_uprobe(struct uprobe *uprobe) struct map_info { struct map_info *next; struct mm_struct *mm; - loff_t vaddr; + unsigned long vaddr; }; static inline struct map_info *free_map_info(struct map_info *info) @@ -837,7 +834,6 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; - loff_t vaddr; if (err) goto free; @@ -847,9 +843,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) if (!vma || !valid_vma(vma, is_register)) goto unlock; - vaddr = vma_address(vma, uprobe->offset); if (vma->vm_file->f_mapping->host != uprobe->inode || - vaddr != info->vaddr) + vma_address(vma, uprobe->offset) != info->vaddr) goto unlock; if (is_register) { @@ -1055,10 +1050,8 @@ int uprobe_mmap(struct vm_area_struct *vma) count = 0; list_for_each_entry(uprobe, &tmp_list, pending_list) { - loff_t vaddr; - if (!ret) { - vaddr = vma_address(vma, uprobe->offset); + loff_t vaddr = vma_address(vma, uprobe->offset); if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { put_uprobe(uprobe); @@ -1122,9 +1115,8 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon build_probe_list(inode, &tmp_list); list_for_each_entry(uprobe, &tmp_list, pending_list) { - loff_t vaddr; + loff_t vaddr = vma_address(vma, uprobe->offset); - vaddr = vma_address(vma, uprobe->offset); if (vaddr >= start && vaddr < end) { /* * An unregister could have removed the probe before -- cgit v1.2.3-59-g8ed1b From 593609a59600c8377f311b300f14deacb155b9a4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:43:59 +0200 Subject: uprobes: __copy_insn() needs "loff_t offset" 1. __copy_insn() needs "loff_t offset", not "unsigned long", to read the file. 2. use pgoff_t for "idx" and remove the unnecessary typecast. 3. fix the typo, "&=" is not what we want 4. can't resist, rename off1 to off. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Acked-by: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120615154359.GA9625@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4df84b76dd48..d1b2eeb80837 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -581,12 +581,12 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) static int __copy_insn(struct address_space *mapping, struct file *filp, char *insn, - unsigned long nbytes, unsigned long offset) + unsigned long nbytes, loff_t offset) { struct page *page; void *vaddr; - unsigned long off1; - unsigned long idx; + unsigned long off; + pgoff_t idx; if (!filp) return -EINVAL; @@ -594,8 +594,8 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, if (!mapping->a_ops->readpage) return -EIO; - idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); - off1 = offset &= ~PAGE_MASK; + idx = offset >> PAGE_CACHE_SHIFT; + off = offset & ~PAGE_MASK; /* * Ensure that the page that has the original instruction is @@ -606,7 +606,7 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, return PTR_ERR(page); vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off1, nbytes); + memcpy(insn, vaddr + off, nbytes); kunmap_atomic(vaddr); page_cache_release(page); -- cgit v1.2.3-59-g8ed1b From e227051b13956b8f71c0abecc41ad351e64671c8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 15 Jun 2012 17:44:01 +0200 Subject: uprobes: Remove the unnecessary initialization in add_utask() Trivial cleanup. No need to nullify ->active_uprobe after kzalloc(). Signed-off-by: Oleg Nesterov Cc: Ananth N Mavinakayanahalli Cc: Anton Arapov Cc: Peter Zijlstra Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120615154401.GA9633@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1b2eeb80837..f93532748bca 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1392,7 +1392,6 @@ static struct uprobe_task *add_utask(void) if (unlikely(!utask)) return NULL; - utask->active_uprobe = NULL; current->utask = utask; return utask; } -- cgit v1.2.3-59-g8ed1b From fbfc623f8231c8d8c78aab5841e9c6e5811ab638 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:31 +0800 Subject: perf: Avoid race between cpu hotplug and installing event perf_event_open() requires the cpu on which to install event is online, but the cpu can go offline after perf_event_open checks that. Add a get_online_cpus()/put_online_cpus() pair to avoid the race. Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-3-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d7d71d6ec972..31d182e01549 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6252,6 +6252,8 @@ SYSCALL_DEFINE5(perf_event_open, } } + get_online_cpus(); + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { @@ -6391,6 +6393,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); + put_online_cpus(); + event->owner = current; mutex_lock(¤t->perf_event_mutex); @@ -6419,6 +6423,7 @@ err_context: err_alloc: free_event(event); err_task: + put_online_cpus(); if (task) put_task_struct(task); err_group_fd: -- cgit v1.2.3-59-g8ed1b From e2d37cd213dcc0aeb3db4b37b9bd1710fe36fbf7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:32 +0800 Subject: perf: Allow the PMU driver to choose the CPU on which to install events Allow the pmu->event_init callback to change event->cpu, so the PMU driver can choose the CPU on which to install events. Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-4-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 31d182e01549..fa36a39e8bb7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6306,7 +6306,7 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, cpu); + ctx = find_get_context(pmu, task, event->cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; @@ -6379,16 +6379,16 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); if (move_group) { - perf_install_in_context(ctx, group_leader, cpu); + perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, cpu); + perf_install_in_context(ctx, sibling, event->cpu); get_ctx(ctx); } } - perf_install_in_context(ctx, event, cpu); + perf_install_in_context(ctx, event, event->cpu); ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); -- cgit v1.2.3-59-g8ed1b From 0cda4c023132aa93f2dd94811061f812e88daf4c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 15 Jun 2012 14:31:33 +0800 Subject: perf: Introduce perf_pmu_migrate_context() Originally from Peter Zijlstra. The helper migrates perf events from one cpu to another cpu. Signed-off-by: Zheng Yan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1339741902-8449-5-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1ce887abcc5c..76c5c8b724a7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1107,6 +1107,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, struct task_struct *task, perf_overflow_handler_t callback, void *context); +extern void perf_pmu_migrate_context(struct pmu *pmu, + int src_cpu, int dst_cpu); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); diff --git a/kernel/events/core.c b/kernel/events/core.c index fa36a39e8bb7..f1cf0edeb39a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); event->ctx = ctx; + if (event->cpu != -1) + event->cpu = cpu; if (!task) { /* @@ -6379,6 +6381,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_lock(&ctx->mutex); if (move_group) { + synchronize_rcu(); perf_install_in_context(ctx, group_leader, event->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, @@ -6484,6 +6487,39 @@ err: } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx; + struct perf_event_context *dst_ctx; + struct perf_event *event, *tmp; + LIST_HEAD(events); + + src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + + mutex_lock(&src_ctx->mutex); + list_for_each_entry_safe(event, tmp, &src_ctx->event_list, + event_entry) { + perf_remove_from_context(event); + put_ctx(src_ctx); + list_add(&event->event_entry, &events); + } + mutex_unlock(&src_ctx->mutex); + + synchronize_rcu(); + + mutex_lock(&dst_ctx->mutex); + list_for_each_entry_safe(event, tmp, &events, event_entry) { + list_del(&event->event_entry); + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + perf_install_in_context(dst_ctx, event, dst_cpu); + get_ctx(dst_ctx); + } + mutex_unlock(&dst_ctx->mutex); +} +EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); + static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { -- cgit v1.2.3-59-g8ed1b From 0be61ebc18b919dddbdbcd1c4f42513c310ecf59 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 18 Jun 2012 09:28:16 -0400 Subject: tracing/selftest: Add a WARN_ON() if a tracer test fails Add a WARN_ON() output on test failures so that they are easier to detect in automated tests. Although, the WARN_ON() will not print if the test causes the system to crash, obviously. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 49249c28690d..748f6401edf6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -830,6 +830,8 @@ int register_tracer(struct tracer *type) current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ -- cgit v1.2.3-59-g8ed1b From 6d158a813efcd09661c23f16ddf7e2ff834cb20c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Jun 2012 20:46:14 -0400 Subject: tracing: Remove NR_CPUS array from trace_iterator Replace the NR_CPUS array of buffer_iter from the trace_iterator with an allocated array. This will just create an array of possible CPUS instead of the max number specified. The use of NR_CPUS in that array caused allocation failures for machines that were tight on memory. This did not cause any failures to the system itself (no crashes), but caused unnecessary failures for reading the trace files. Added a helper function called 'trace_buffer_iter()' that returns the buffer_iter item or NULL if it is not defined or the array was not allocated. Some routines do not require the array (tracing_open_pipe() for one). Reported-by: Dave Jones Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 +- kernel/trace/trace.c | 27 ++++++++++++++++++--------- kernel/trace/trace.h | 8 ++++++++ kernel/trace/trace_functions_graph.c | 2 +- 4 files changed, 28 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 1aff18346c71..af961d6f7ab1 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -65,7 +65,7 @@ struct trace_iterator { void *private; int cpu_file; struct mutex mutex; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; + struct ring_buffer_iter **buffer_iter; unsigned long iter_flags; /* trace_seq for __print_flags() and __print_symbolic() etc. */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 748f6401edf6..b2af14e94c28 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1710,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu); + iter->idx++; - if (iter->buffer_iter[iter->cpu]) - ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + if (buf_iter) + ring_buffer_read(buf_iter, NULL); } static struct trace_entry * @@ -1720,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) { struct ring_buffer_event *event; - struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; + struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); @@ -1858,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) tr->data[cpu]->skipped_entries = 0; - if (!iter->buffer_iter[cpu]) + buf_iter = trace_buffer_iter(iter, cpu); + if (!buf_iter) return; - buf_iter = iter->buffer_iter[cpu]; ring_buffer_iter_reset(buf_iter); /* @@ -2207,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) int trace_empty(struct trace_iterator *iter) { + struct ring_buffer_iter *buf_iter; int cpu; /* If we are looking at one CPU buffer, only check that one */ if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { cpu = iter->cpu_file; - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2223,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter) } for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) { - if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + buf_iter = trace_buffer_iter(iter, cpu); + if (buf_iter) { + if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) @@ -2383,6 +2388,8 @@ __tracing_open(struct inode *inode, struct file *file) if (!iter) return ERR_PTR(-ENOMEM); + iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), + GFP_KERNEL); /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. @@ -2443,6 +2450,7 @@ __tracing_open(struct inode *inode, struct file *file) fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); + kfree(iter->buffer_iter); seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } @@ -2483,6 +2491,7 @@ static int tracing_release(struct inode *inode, struct file *file) mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); kfree(iter->trace); + kfree(iter->buffer_iter); seq_release_private(inode, file); return 0; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5aec220d2de0..55e1f7f0db12 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -317,6 +317,14 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 +static inline struct ring_buffer_iter * +trace_buffer_iter(struct trace_iterator *iter, int cpu) +{ + if (iter->buffer_iter && iter->buffer_iter[cpu]) + return iter->buffer_iter[cpu]; + return NULL; +} + int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a7d2a4c653d8..ce27c8ba8d31 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter, next = &data->ret; } else { - ring_iter = iter->buffer_iter[iter->cpu]; + ring_iter = trace_buffer_iter(iter, iter->cpu); /* First peek to compare current entry and the next one */ if (ring_iter) -- cgit v1.2.3-59-g8ed1b From a5fb833172eca69136e9ee1ada778e404086ab8a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 Jun 2012 13:35:04 -0400 Subject: ring-buffer: Fix uninitialized read_stamp The ring buffer reader page is used to swap a page from the writable ring buffer. If the writer happens to be on that page, it ends up on the reader page, but will simply move off of it, back into the writable ring buffer as writes are added. The time stamp passed back to the readers is stored in the cpu_buffer per CPU descriptor. This stamp is updated when a swap of the reader page takes place, and it reads the current stamp from the page taken from the writable ring buffer. Everytime a writer goes to a new page, it updates the time stamp of that page. The problem happens if a reader reads a page from an empty per CPU ring buffer. If the buffer is empty, the swap still takes place, placing the writer at the start of the reader page. If at a later time, a write happens, it updates the page's time stamp and continues. But the problem is that the read_stamp does not get updated, because the page was already swapped. The solution to this was to not swap the page if the ring buffer happens to be empty. This also removes the side effect that the writes on the reader page will not get updated because the writer never gets back on the reader page without a swap. That is, if a read happens on an empty buffer, but then no reads happen for a while. If a swap took place, and the writer were to start writing a lot of data (function tracer), it will start overflowing the ring buffer and overwrite the older data. But because the writer never goes back onto the reader page, the data left on the reader page never gets overwritten. This causes the reader to see really old data, followed by a jump to newer data. Link: http://lkml.kernel.org/r/1340060577-9112-1-git-send-email-dhsharp@google.com Google-Bug-Id: 6410455 Reported-by: David Sharp tested-by: David Sharp Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5e..82a3e0c56b1d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->commit_page == cpu_buffer->reader_page) goto out; + /* Don't bother swapping if the ring buffer is empty */ + if (rb_num_of_entries(cpu_buffer) == 0) + goto out; + /* * Reset the reader page to size zero. */ -- cgit v1.2.3-59-g8ed1b From 93574fcc5b50cc7b8834698acb2ce947e5b6a5dc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Jul 2012 09:35:08 +0300 Subject: tracing: Check for allocation failure in __tracing_open() Clean up and return -ENOMEM on if the kzalloc() fails. This also prevents a potential crash, as the pointer that failed to allocate would be later used. Link: http://lkml.kernel.org/r/20120711063507.GF11812@elgon.mountain Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 814ff306ae74..a120f98c4112 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2390,6 +2390,9 @@ __tracing_open(struct inode *inode, struct file *file) iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), GFP_KERNEL); + if (!iter->buffer_iter) + goto release; + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. @@ -2451,6 +2454,7 @@ __tracing_open(struct inode *inode, struct file *file) mutex_unlock(&trace_types_lock); kfree(iter->trace); kfree(iter->buffer_iter); +release: seq_release_private(inode, file); return ERR_PTR(-ENOMEM); } -- cgit v1.2.3-59-g8ed1b