From f4f34e1b82eb4219d8eaa1c7e2e17ca219a6a2b5 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 1 Mar 2019 04:12:00 +0100 Subject: x86/unwind: Handle NULL pointer calls better in frame unwinder When the frame unwinder is invoked for an oops caused by a call to NULL, it currently skips the parent function because BP still points to the parent's stack frame; the (nonexistent) current function only has the first half of a stack frame, and BP doesn't point to it yet. Add a special case for IP==0 that calculates a fake BP from SP, then uses the real BP for the next frame. Note that this handles first_frame specially: Return information about the parent function as long as the saved IP is >=first_frame, even if the fake BP points below it. With an artificially-added NULL call in prctl_set_seccomp(), before this patch, the trace is: Call Trace: ? prctl_set_seccomp+0x3a/0x50 __x64_sys_prctl+0x457/0x6f0 ? __ia32_sys_prctl+0x750/0x750 do_syscall_64+0x72/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 After this patch, the trace is: Call Trace: prctl_set_seccomp+0x3a/0x50 __x64_sys_prctl+0x457/0x6f0 ? __ia32_sys_prctl+0x750/0x750 do_syscall_64+0x72/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Acked-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Andrew Morton Cc: syzbot Cc: "H. Peter Anvin" Cc: Masahiro Yamada Cc: Michal Marek Cc: linux-kbuild@vger.kernel.org Link: https://lkml.kernel.org/r/20190301031201.7416-1-jannh@google.com --- arch/x86/include/asm/unwind.h | 6 ++++++ arch/x86/kernel/unwind_frame.c | 25 ++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 1f86e1b0a5cd..499578f7e6d7 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -23,6 +23,12 @@ struct unwind_state { #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; + /* + * If non-NULL: The current frame is incomplete and doesn't contain a + * valid BP. When looking for the next frame, use this instead of the + * non-existent saved BP. + */ + unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 3dc26f95d46e..9b9fd4826e7a 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -320,10 +320,14 @@ bool unwind_next_frame(struct unwind_state *state) } /* Get the next frame pointer: */ - if (state->regs) + if (state->next_bp) { + next_bp = state->next_bp; + state->next_bp = NULL; + } else if (state->regs) { next_bp = (unsigned long *)state->regs->bp; - else + } else { next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp); + } /* Move to the next frame if it's safe: */ if (!update_stack_state(state, next_bp)) @@ -398,6 +402,21 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, bp = get_frame_pointer(task, regs); + /* + * If we crash with IP==0, the last successfully executed instruction + * was probably an indirect function call with a NULL function pointer. + * That means that SP points into the middle of an incomplete frame: + * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we + * would have written a frame pointer if we hadn't crashed. + * Pretend that the frame is complete and that BP points to it, but save + * the real BP so that we can use it when looking for the next frame. + */ + if (regs && regs->ip == 0 && + (unsigned long *)kernel_stack_pointer(regs) >= first_frame) { + state->next_bp = bp; + bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1; + } + /* Initialize stack info and make sure the frame data is accessible: */ get_stack_info(bp, state->task, &state->stack_info, &state->stack_mask); @@ -410,7 +429,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, */ while (!unwind_done(state) && (!on_stack(&state->stack_info, first_frame, sizeof(long)) || - state->bp < first_frame)) + (state->next_bp == NULL && state->bp < first_frame))) unwind_next_frame(state); } EXPORT_SYMBOL_GPL(__unwind_start); -- cgit v1.2.3-59-g8ed1b From ac5ceccce5501e43d217c596e4ee859f2a3fef79 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 1 Mar 2019 04:12:01 +0100 Subject: x86/unwind: Add hardcoded ORC entry for NULL When the ORC unwinder is invoked for an oops caused by IP==0, it currently has no idea what to do because there is no debug information for the stack frame of NULL. But if RIP is NULL, it is very likely that the last successfully executed instruction was an indirect CALL/JMP, and it is possible to unwind out in the same way as for the first instruction of a normal function. Hardcode a corresponding ORC entry. With an artificially-added NULL call in prctl_set_seccomp(), before this patch, the trace is: Call Trace: ? __x64_sys_prctl+0x402/0x680 ? __ia32_sys_prctl+0x6e0/0x6e0 ? __do_page_fault+0x457/0x620 ? do_syscall_64+0x6d/0x160 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 After this patch, the trace looks like this: Call Trace: __x64_sys_prctl+0x402/0x680 ? __ia32_sys_prctl+0x6e0/0x6e0 ? __do_page_fault+0x457/0x620 do_syscall_64+0x6d/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 prctl_set_seccomp() still doesn't show up in the trace because for some reason, tail call optimization is only disabled in builds that use the frame pointer unwinder. Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Acked-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Andrew Morton Cc: syzbot Cc: "H. Peter Anvin" Cc: Masahiro Yamada Cc: Michal Marek Cc: linux-kbuild@vger.kernel.org Link: https://lkml.kernel.org/r/20190301031201.7416-2-jannh@google.com --- arch/x86/kernel/unwind_orc.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 26038eacf74a..89be1be1790c 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -113,6 +113,20 @@ static struct orc_entry *orc_ftrace_find(unsigned long ip) } #endif +/* + * If we crash with IP==0, the last successfully executed instruction + * was probably an indirect function call with a NULL function pointer, + * and we don't have unwind information for NULL. + * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function + * pointer into its parent and then continue normally from there. + */ +static struct orc_entry null_orc_entry = { + .sp_offset = sizeof(long), + .sp_reg = ORC_REG_SP, + .bp_reg = ORC_REG_UNDEFINED, + .type = ORC_TYPE_CALL +}; + static struct orc_entry *orc_find(unsigned long ip) { static struct orc_entry *orc; @@ -120,6 +134,9 @@ static struct orc_entry *orc_find(unsigned long ip) if (!orc_init) return NULL; + if (ip == 0) + return &null_orc_entry; + /* For non-init vmlinux addresses, use the fast lookup table: */ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { unsigned int idx, start, stop; -- cgit v1.2.3-59-g8ed1b From a2863b53418d7d8f6332adf0cfb32611def0c4b9 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 6 Mar 2019 16:38:06 -0500 Subject: Revert "x86_64: Increase stack size for KASAN_EXTRA" This reverts commit a8e911d13540487942d53137c156bd7707f66e5d. KASAN_EXTRA was removed via the commit 7771bdbbfd3d ("kasan: remove use after scope bugs detection."), so this is no longer needed. Signed-off-by: Qian Cai Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: bp@alien8.de Cc: akpm@linux-foundation.org Cc: aryabinin@virtuozzo.com Cc: glider@google.com Cc: dvyukov@google.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/20190306213806.46139-1-cai@lca.pw --- arch/x86/include/asm/page_64_types.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 0ce558a8150d..8f657286d599 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -7,11 +7,7 @@ #endif #ifdef CONFIG_KASAN -#ifdef CONFIG_KASAN_EXTRA -#define KASAN_STACK_ORDER 2 -#else #define KASAN_STACK_ORDER 1 -#endif #else #define KASAN_STACK_ORDER 0 #endif -- cgit v1.2.3-59-g8ed1b From 3609e31bc8dc03b701390f79c74fc7fe92b95039 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 28 Feb 2019 17:01:55 -0500 Subject: x86/mm: Remove unused variable 'cpu' The commit a2055abe9c67 ("x86/mm: Pass flush_tlb_info to flush_tlb_others() etc") removed the unnecessary cpu parameter from uv_flush_tlb_others() but left an unused variable. arch/x86/mm/tlb.c: In function 'native_flush_tlb_others': arch/x86/mm/tlb.c:688:16: warning: variable 'cpu' set but not used [-Wunused-but-set-variable] unsigned int cpu; ^~~ Fixes: a2055abe9c67 ("x86/mm: Pass flush_tlb_info to flush_tlb_others() etc") Signed-off-by: Qian Cai Signed-off-by: Thomas Gleixner Acked-by: Andyt Lutomirski Cc: dave.hansen@linux.intel.com Cc: peterz@infradead.org Cc: bp@alien8.de Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/20190228220155.88124-1-cai@lca.pw --- arch/x86/mm/tlb.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 999d6d8f0bef..bc4bc7b2f075 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -685,9 +685,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * that UV should be updated so that smp_call_function_many(), * etc, are optimal on UV. */ - unsigned int cpu; - - cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) smp_call_function_many(cpumask, flush_tlb_func_remote, -- cgit v1.2.3-59-g8ed1b From 24c41220659ecc5576c34c6f23537f8d3949fb05 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Fri, 1 Mar 2019 10:29:24 -0500 Subject: x86/mm: Remove unused variable 'old_pte' The commit 3a19109efbfa ("x86/mm: Fix try_preserve_large_page() to handle large PAT bit") fixed try_preserve_large_page() by using the corresponding pud/pmd prot/pfn interfaces, but left a variable unused because it no longer used pte_pfn(). Later, the commit 8679de0959e6 ("x86/mm/cpa: Split, rename and clean up try_preserve_large_page()") renamed try_preserve_large_page() to __should_split_large_page(), but the unused variable remains. arch/x86/mm/pageattr.c: In function '__should_split_large_page': arch/x86/mm/pageattr.c:741:17: warning: variable 'old_pte' set but not used [-Wunused-but-set-variable] Fixes: 3a19109efbfa ("x86/mm: Fix try_preserve_large_page() to handle large PAT bit") Signed-off-by: Qian Cai Signed-off-by: Thomas Gleixner Cc: dave.hansen@linux.intel.com Cc: luto@kernel.org Cc: peterz@infradead.org Cc: toshi.kani@hpe.com Cc: bp@alien8.de Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/20190301152924.94762-1-cai@lca.pw --- arch/x86/mm/pageattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 14e6119838a6..4c570612e24e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -738,7 +738,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, { unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; pgprot_t old_prot, new_prot, req_prot, chk_prot; - pte_t new_pte, old_pte, *tmp; + pte_t new_pte, *tmp; enum pg_level level; /* @@ -781,7 +781,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, * Convert protection attributes to 4k-format, as cpa->mask* are set * up accordingly. */ - old_pte = *kpte; + /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ req_prot = pgprot_large_2_4k(old_prot); -- cgit v1.2.3-59-g8ed1b From 179fb36abb097976997f50733d5b122a29158cba Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 6 Mar 2019 19:18:27 +0800 Subject: x86/hyperv: Fix kernel panic when kexec on HyperV After commit 68bb7bfb7985 ("X86/Hyper-V: Enable IPI enlightenments"), kexec fails with a kernel panic: kexec_core: Starting new kernel BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v3.0 03/02/2018 RIP: 0010:0xffffc9000001d000 Call Trace: ? __send_ipi_mask+0x1c6/0x2d0 ? hv_send_ipi_mask_allbutself+0x6d/0xb0 ? mp_save_irq+0x70/0x70 ? __ioapic_read_entry+0x32/0x50 ? ioapic_read_entry+0x39/0x50 ? clear_IO_APIC_pin+0xb8/0x110 ? native_stop_other_cpus+0x6e/0x170 ? native_machine_shutdown+0x22/0x40 ? kernel_kexec+0x136/0x156 That happens if hypercall based IPIs are used because the hypercall page is reset very early upon kexec reboot, but kexec sends IPIs to stop CPUs, which invokes the hypercall and dereferences the unusable page. To fix his, reset hv_hypercall_pg to NULL before the page is reset to avoid any misuse, IPI sending will fall back to the non hypercall based method. This only happens on kexec / kdump so just setting the pointer to NULL is good enough. Fixes: 68bb7bfb7985 ("X86/Hyper-V: Enable IPI enlightenments") Signed-off-by: Kairui Song Signed-off-by: Thomas Gleixner Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Sasha Levin Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Vitaly Kuznetsov Cc: Dave Young Cc: devel@linuxdriverproject.org Link: https://lkml.kernel.org/r/20190306111827.14131-1-kasong@redhat.com --- arch/x86/hyperv/hv_init.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 7abb09e2eeb8..d3f42b6bbdac 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -406,6 +406,13 @@ void hyperv_cleanup(void) /* Reset our OS id */ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + /* + * Reset hypercall page reference before reset the page, + * let hypercall operations fail safely rather than + * panic the kernel for using invalid hypercall page + */ + hv_hypercall_pg = NULL; + /* Reset the hypercall page */ hypercall_msr.as_uint64 = 0; wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); -- cgit v1.2.3-59-g8ed1b