From 35d0e6fb4d219d64ab3b7cffef7a11a0662140f5 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 1 Aug 2017 15:35:53 +0100 Subject: arm64: syscallno is secretly an int, make it official
The upper 32 bits of the syscallno field in thread_struct are handled inconsistently, being sometimes zero-extended and sometimes sign-extended. In fact, only the lower 32 bits seem to have any real significance for the behaviour of the code: it's been OK to handle the upper bits inconsistently because they don't matter.
Currently, the only place I can find where those bits are significant is in calling trace_sys_enter(), which may be unintentional: for example, if a compat tracer attempts to cancel a syscall by passing -1 to (COMPAT_)PTRACE_SET_SYSCALL at the syscall-enter-stop, it will be traced as syscall 4294967295 rather than -1 as might be expected (and as occurs for a native tracer doing the same thing). Elsewhere, reads of syscallno cast it to an int or truncate it.
There's also a conspicuous amount of code and casting to bodge around the fact that although semantically an int, syscallno is stored as a u64.
Let's not pretend any more.
In order to preserve the stp x instruction that stores the syscall number in entry.S, this patch special-cases the layout of struct pt_regs for big endian so that the newly 32-bit syscallno field maps onto the low bits of the stored value. This is not beautiful, but benchmarking of the getpid syscall on Juno indicates a minor slowdown if the stp is split into an stp x and an stp w.
Signed-off-by: Dave Martin Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/arm64/kernel/entry.S')
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b738880350f9..3bf0bd7a2f29 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -142,8 +142,8 @@ alternative_else_nop_endif * Set syscallno to -1 by default (overridden later if real syscall). */ .if \el == 0 - mvn x21, xzr - str x21, [sp, #S_SYSCALLNO] + mvn w21, wzr + str w21, [sp, #S_SYSCALLNO] .endif /* @@ -290,8 +290,9 @@ alternative_else_nop_endif * * x7 is reserved for the system call number in 32-bit mode.
*/ -sc_nr .req x25 // number of system calls -scno .req x26 // syscall number +wsc_nr .req w25 // number of system calls +wscno .req w26 // syscall number +xscno .req x26 // syscall number (zero-extended) stbl .req x27 // syscall table pointer tsk .req x28 // current thread_info @@ -577,8 +578,8 @@ el0_svc_compat: * AArch32 syscall handling */ adrp stbl, compat_sys_call_table // load compat syscall table pointer - uxtw scno, w7 // syscall number in w7 (r7) - mov sc_nr, #__NR_compat_syscalls + mov wscno, w7 // syscall number in w7 (r7) + mov wsc_nr, #__NR_compat_syscalls b el0_svc_naked .align 6 @@ -798,19 +799,19 @@ ENDPROC(ret_from_fork) .align 6 el0_svc: adrp stbl, sys_call_table // load syscall table pointer - uxtw scno, w8 // syscall number in w8 - mov sc_nr, #__NR_syscalls + mov wscno, w8 // syscall number in w8 + mov wsc_nr, #__NR_syscalls el0_svc_naked: // compat entry point - stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number + stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and syscall number enable_dbg_and_irq ct_user_exit 1 ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace - cmp scno, sc_nr // check upper syscall limit + cmp wscno, wsc_nr // check upper syscall limit b.hs ni_sys - ldr x16, [stbl, scno, lsl #3] // address in the syscall table + ldr x16, [stbl, xscno, lsl #3] // address in the syscall table blr x16 // call sys_* routine b ret_fast_syscall ni_sys: @@ -824,24 +825,23 @@ ENDPROC(el0_svc) * switches, and waiting for our parent to respond. */ __sys_trace: - mov w0, #-1 // set default errno for - cmp scno, x0 // user-issued syscall(-1) + cmp wscno, #-1 // user-issued syscall(-1)? b.ne 1f - mov x0, #-ENOSYS + mov x0, #-ENOSYS // set default errno if so str x0, [sp, #S_X0] 1: mov x0, sp bl syscall_trace_enter cmp w0, #-1 // skip the syscall? b.eq __sys_trace_return_skipped - uxtw scno, w0 // syscall number (possibly new) + mov wscno, w0 // syscall number (possibly new) mov x1, sp // pointer to regs - cmp scno, sc_nr // check upper syscall limit + cmp wscno, wsc_nr // check upper syscall limit b.hs __ni_sys_trace ldp x0, x1, [sp] // restore the syscall args ldp x2, x3, [sp, #S_X2] ldp x4, x5, [sp, #S_X4] ldp x6, x7, [sp, #S_X6] - ldr x16, [stbl, scno, lsl #3] // address in the syscall table + ldr x16, [stbl, xscno, lsl #3] // address in the syscall table blr x16 // call sys_* routine __sys_trace_return: -- cgit v1.2.3-59-g8ed1b From 17c28958600928109049a3bcc814b0d5bfb1ff3a Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 1 Aug 2017 15:35:54 +0100 Subject: arm64: Abstract syscallno manipulation The -1 "no syscall" value is written in various ways, shared with the user ABI in some places, and generally obscure. This patch attempts to make things a little more consistent and readable by replacing all these uses with a single #define. A couple of symbolic helpers are provided to clarify the intent further. Because the in-syscall check in do_signal() is changed from >= 0 to != NO_SYSCALL by this patch, different behaviour may be observable if syscallno is set to values less than -1 by a tracer. However, this is not different from the behaviour that is already observable if a tracer sets syscallno to a value >= __NR_(compat_)syscalls. It appears that this can cause spurious syscall restarting, but that is not a new behaviour either, and does not appear harmful. 
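As a rough illustration only (not an additional hunk in this patch), the new helpers reduce the open-coded ~0 stores and >= 0 checks to a pattern along these lines, mirroring what the signal.c hunks below do in do_signal():

	if (in_syscall(regs)) {
		/* ... work out whether the interrupted syscall should be restarted ... */
		forget_syscall(regs);	/* avoid additional restarting via ret_to_user */
	}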
Signed-off-by: Dave Martin Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/processor.h | 2 +- arch/arm64/include/asm/ptrace.h | 21 +++++++++++++++++++++ arch/arm64/kernel/entry.S | 10 ++++------ arch/arm64/kernel/ptrace.c | 2 +- arch/arm64/kernel/signal.c | 10 +++++----- arch/arm64/kernel/signal32.c | 2 +- 6 files changed, 33 insertions(+), 14 deletions(-) (limited to 'arch/arm64/kernel/entry.S') diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 379def1d6b67..b7334f11c4cf 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -112,7 +112,7 @@ void tls_preserve_current_state(void); static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) { memset(regs, 0, sizeof(*regs)); - regs->syscallno = ~0; + forget_syscall(regs); regs->pc = pc; } diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 21c87dc240e1..4f64373b84fd 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -72,8 +72,19 @@ #define COMPAT_PT_TEXT_ADDR 0x10000 #define COMPAT_PT_DATA_ADDR 0x10004 #define COMPAT_PT_TEXT_END_ADDR 0x10008 + +/* + * If pt_regs.syscallno == NO_SYSCALL, then the thread is not executing + * a syscall -- i.e., its most recent entry into the kernel from + * userspace was not via SVC, or otherwise a tracer cancelled the syscall. + * + * This must have the value -1, for ABI compatibility with ptrace etc. + */ +#define NO_SYSCALL (-1) + #ifndef __ASSEMBLY__ #include +#include /* sizeof(struct user) for AArch32 */ #define COMPAT_USER_SZ 296 @@ -128,6 +139,16 @@ struct pt_regs { u64 unused; // maintain 16 byte alignment }; +static inline bool in_syscall(struct pt_regs const *regs) +{ + return regs->syscallno != NO_SYSCALL; +} + +static inline void forget_syscall(struct pt_regs *regs) +{ + regs->syscallno = NO_SYSCALL; +} + #define MAX_REG_OFFSET offsetof(struct pt_regs, pstate) #define arch_has_single_step() (1) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 3bf0bd7a2f29..cace76d17535 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -138,11 +138,9 @@ alternative_else_nop_endif stp x22, x23, [sp, #S_PC] - /* - * Set syscallno to -1 by default (overridden later if real syscall). - */ + /* Not in a syscall by default (el0_svc overwrites for real syscall) */ .if \el == 0 - mvn w21, wzr + mov w21, #NO_SYSCALL str w21, [sp, #S_SYSCALLNO] .endif @@ -825,13 +823,13 @@ ENDPROC(el0_svc) * switches, and waiting for our parent to respond. */ __sys_trace: - cmp wscno, #-1 // user-issued syscall(-1)? + cmp wscno, #NO_SYSCALL // user-issued syscall(-1)? b.ne 1f mov x0, #-ENOSYS // set default errno if so str x0, [sp, #S_X0] 1: mov x0, sp bl syscall_trace_enter - cmp w0, #-1 // skip the syscall? + cmp w0, #NO_SYSCALL // skip the syscall? 
b.eq __sys_trace_return_skipped mov wscno, w0 // syscall number (possibly new) mov x1, sp // pointer to regs diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index de774805f672..28619b5b6746 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1363,7 +1363,7 @@ static void tracehook_report_syscall(struct pt_regs *regs, if (dir == PTRACE_SYSCALL_EXIT) tracehook_report_syscall_exit(regs, 0); else if (tracehook_report_syscall_entry(regs)) - regs->syscallno = ~0; + forget_syscall(regs); regs->regs[regno] = saved_reg; } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 4d04b891c00d..4991e87f80cc 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -387,7 +388,7 @@ static int restore_sigframe(struct pt_regs *regs, /* * Avoid sys_rt_sigreturn() restarting. */ - regs->syscallno = ~0; + forget_syscall(regs); err |= !valid_user_regs(®s->user_regs, current); if (err == 0) @@ -673,13 +674,12 @@ static void do_signal(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; - int syscall = regs->syscallno; struct ksignal ksig; /* * If we were from a system call, check for system call restarting... */ - if (syscall >= 0) { + if (in_syscall(regs)) { continue_addr = regs->pc; restart_addr = continue_addr - (compat_thumb_mode(regs) ? 2 : 4); retval = regs->regs[0]; @@ -687,7 +687,7 @@ static void do_signal(struct pt_regs *regs) /* * Avoid additional syscall restarting via ret_to_user. */ - regs->syscallno = ~0; + forget_syscall(regs); /* * Prepare for system call restart. We do this here so that a @@ -731,7 +731,7 @@ static void do_signal(struct pt_regs *regs) * Handle restarting a different system call. As above, if a debugger * has chosen to restart at a different PC, ignore the restart. */ - if (syscall >= 0 && regs->pc == restart_addr) { + if (in_syscall(regs) && regs->pc == restart_addr) { if (retval == -ERESTART_RESTARTBLOCK) setup_restart_syscall(regs); user_rewind_single_step(current); diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index d98ca76cbd39..4e5a664be04b 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -354,7 +354,7 @@ static int compat_restore_sigframe(struct pt_regs *regs, /* * Avoid compat_sys_sigreturn() restarting. */ - regs->syscallno = ~0; + forget_syscall(regs); err |= !valid_user_regs(®s->user_regs, current); -- cgit v1.2.3-59-g8ed1b From 2d0e751a4789fc5ab4a5c9de5d6407b41fdfbbf0 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 26 Jul 2017 11:14:53 +0100 Subject: arm64: consistently use bl for C exception entry In most cases, our exception entry assembly branches to C handlers with a BL instruction, but in cases where we do not expect to return, we use B instead. While this is correct today, it means that backtraces for fatal exceptions miss the entry assembly (as the LR is stale at the point we call C code), while non-fatal exceptions have the entry assembly in the LR. In subsequent patches, we will need the LR to be set in these cases in order to backtrace reliably. This patch updates these sites to use a BL, ensuring consistency, and preparing for backtrace rework. An ASM_BUG() is added after each of these new BLs, which both catches unexpected returns, and ensures that the LR value doesn't point to another function label. 
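For background (a conceptual sketch, not part of the patch): the arm64 unwinder walks frame records, each a saved {fp, lr} pair, so whatever is in the LR when a C handler builds its frame is what a backtrace reports as the caller:

	/*
	 * Conceptual model of an AArch64 frame record as the unwinder sees it;
	 * the handler's prologue saves x29/x30 in this layout.
	 */
	struct frame_record {
		struct frame_record *fp;	/* caller's frame record (saved x29) */
		unsigned long lr;		/* return address into the caller (saved x30) */
	};

With a plain B, the LR still holds an address left over from some earlier, unrelated call, so the entry assembly never shows up in the trace; with BL, the LR points just past the BL in the entry assembly, and the ASM_BUG() that follows traps if the handler unexpectedly returns.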
Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon --- arch/arm64/kernel/entry.S | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/arm64/kernel/entry.S') diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b738880350f9..660612a07ec5 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -351,7 +351,8 @@ END(vectors) mov x0, sp mov x1, #\reason mrs x2, esr_el1 - b bad_mode + bl bad_mode + ASM_BUG() .endm el0_sync_invalid: @@ -448,14 +449,16 @@ el1_sp_pc: mrs x0, far_el1 enable_dbg mov x2, sp - b do_sp_pc_abort + bl do_sp_pc_abort + ASM_BUG() el1_undef: /* * Undefined instruction */ enable_dbg mov x0, sp - b do_undefinstr + bl do_undefinstr + ASM_BUG() el1_dbg: /* * Debug exception handling @@ -473,7 +476,8 @@ el1_inv: mov x0, sp mov x2, x1 mov x1, #BAD_SYNC - b bad_mode + bl bad_mode + ASM_BUG() ENDPROC(el1_sync) .align 6 -- cgit v1.2.3-59-g8ed1b From ed84b4e9582bdfeffc617589fe17dddfc5fe6672 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 26 Jul 2017 16:05:20 +0100 Subject: arm64: move non-entry code out of .entry.text Currently, cpu_switch_to and ret_from_fork both live in .entry.text, though neither form the critical path for an exception entry. In subsequent patches, we will require that code in .entry.text is part of the critical path for exception entry, for which we can assume certain properties (e.g. the presence of exception regs on the stack). Neither cpu_switch_to nor ret_from_fork will meet these requirements, so we must move them out of .entry.text. To ensure that neither are kprobed after being moved out of .entry.text, we must explicitly blacklist them, requiring a new NOKPROBE() asm helper. Signed-off-by: Mark Rutland Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon --- arch/arm64/include/asm/assembler.h | 11 +++++ arch/arm64/kernel/entry.S | 90 +++++++++++++++++++------------------- 2 files changed, 57 insertions(+), 44 deletions(-) (limited to 'arch/arm64/kernel/entry.S') diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 1b67c3782d00..610a42018241 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -403,6 +403,17 @@ alternative_endif .size __pi_##x, . - x; \ ENDPROC(x) +/* + * Annotate a function as being unsuitable for kprobes. + */ +#ifdef CONFIG_KPROBES +#define NOKPROBE(x) \ + .pushsection "_kprobe_blacklist", "aw"; \ + .quad x; \ + .popsection; +#else +#define NOKPROBE(x) +#endif /* * Emit a 64-bit absolute little endian symbol reference in a way that * ensures that it will be resolved at build time, even when building a diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 660612a07ec5..9e126d3d8b53 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -710,38 +710,6 @@ el0_irq_naked: b ret_to_user ENDPROC(el0_irq) -/* - * Register switch for AArch64. The callee-saved registers need to be saved - * and restored. On entry: - * x0 = previous task_struct (must be preserved across the switch) - * x1 = next task_struct - * Previous and next are guaranteed not to be the same. 
- * - */ -ENTRY(cpu_switch_to) - mov x10, #THREAD_CPU_CONTEXT - add x8, x0, x10 - mov x9, sp - stp x19, x20, [x8], #16 // store callee-saved registers - stp x21, x22, [x8], #16 - stp x23, x24, [x8], #16 - stp x25, x26, [x8], #16 - stp x27, x28, [x8], #16 - stp x29, x9, [x8], #16 - str lr, [x8] - add x8, x1, x10 - ldp x19, x20, [x8], #16 // restore callee-saved registers - ldp x21, x22, [x8], #16 - ldp x23, x24, [x8], #16 - ldp x25, x26, [x8], #16 - ldp x27, x28, [x8], #16 - ldp x29, x9, [x8], #16 - ldr lr, [x8] - mov sp, x9 - msr sp_el0, x1 - ret -ENDPROC(cpu_switch_to) - /* * This is the fast syscall return path. We do as little as possible here, * and this includes saving x0 back into the kernel stack. @@ -784,18 +752,6 @@ finish_ret_to_user: kernel_exit 0 ENDPROC(ret_to_user) -/* - * This is how we return from a fork. - */ -ENTRY(ret_from_fork) - bl schedule_tail - cbz x19, 1f // not a kernel thread - mov x0, x20 - blr x19 -1: get_thread_info tsk - b ret_to_user -ENDPROC(ret_from_fork) - /* * SVC handler. */ @@ -869,3 +825,49 @@ ENTRY(sys_rt_sigreturn_wrapper) mov x0, sp b sys_rt_sigreturn ENDPROC(sys_rt_sigreturn_wrapper) + +/* + * Register switch for AArch64. The callee-saved registers need to be saved + * and restored. On entry: + * x0 = previous task_struct (must be preserved across the switch) + * x1 = next task_struct + * Previous and next are guaranteed not to be the same. + * + */ +ENTRY(cpu_switch_to) + mov x10, #THREAD_CPU_CONTEXT + add x8, x0, x10 + mov x9, sp + stp x19, x20, [x8], #16 // store callee-saved registers + stp x21, x22, [x8], #16 + stp x23, x24, [x8], #16 + stp x25, x26, [x8], #16 + stp x27, x28, [x8], #16 + stp x29, x9, [x8], #16 + str lr, [x8] + add x8, x1, x10 + ldp x19, x20, [x8], #16 // restore callee-saved registers + ldp x21, x22, [x8], #16 + ldp x23, x24, [x8], #16 + ldp x25, x26, [x8], #16 + ldp x27, x28, [x8], #16 + ldp x29, x9, [x8], #16 + ldr lr, [x8] + mov sp, x9 + msr sp_el0, x1 + ret +ENDPROC(cpu_switch_to) +NOKPROBE(cpu_switch_to) + +/* + * This is how we return from a fork. + */ +ENTRY(ret_from_fork) + bl schedule_tail + cbz x19, 1f // not a kernel thread + mov x0, x20 + blr x19 +1: get_thread_info tsk + b ret_to_user +ENDPROC(ret_from_fork) +NOKPROBE(ret_from_fork) -- cgit v1.2.3-59-g8ed1b From 7326749801396105aef0ed9229df746ac9e24300 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 22 Jul 2017 18:45:33 +0100 Subject: arm64: unwind: reference pt_regs via embedded stack frame As it turns out, the unwind code is slightly broken, and probably has been for a while. The problem is in the dumping of the exception stack, which is intended to dump the contents of the pt_regs struct at each level in the call stack where an exception was taken and routed to a routine marked as __exception (which means its stack frame is right below the pt_regs struct on the stack). 'Right below the pt_regs struct' is ill defined, though: the unwind code assigns 'frame pointer + 0x10' to the .sp member of the stackframe struct at each level, and dump_backtrace() happily dereferences that as the pt_regs pointer when encountering an __exception routine. However, the actual size of the stack frame created by this routine (which could be one of many __exception routines we have in the kernel) is not known, and so frame.sp is pretty useless to figure out where struct pt_regs really is. 
So it seems the only way to ensure that we can find our struct pt_regs when walking the stack frames is to put it at a known fixed offset of the stack frame pointer that is passed to such __exception routines. The simplest way to do that is to put it inside pt_regs itself, which is the main change implemented by this patch. As a bonus, doing this allows us to get rid of a fair amount of cruft related to walking from one stack to the other, which is especially nice since we intend to introduce yet another stack for overflow handling once we add support for vmapped stacks.
It also fixes an inconsistency where we only add a stack frame pointing to ELR_EL1 if we are executing from the IRQ stack but not when we are executing from the task stack.
To consistently identify exception regs even in the presence of exceptions taken from entry code, we must check whether the next frame was created by entry text, rather than whether the current frame was created by exception text.
To avoid backtracing using PCs that fall in the idmap, or are controlled by userspace, we must explicitly zero the FP and LR in startup paths, and must ensure that the frame embedded in pt_regs is zeroed upon entry from EL0. To avoid these NULL entries showing up in the backtrace, unwind_frame() is updated to avoid them.
Signed-off-by: Ard Biesheuvel [Mark: compare current frame against .entry.text, avoid bogus PCs] Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon --- arch/arm64/include/asm/irq.h | 25 ------------------------- arch/arm64/include/asm/ptrace.h | 1 + arch/arm64/include/asm/traps.h | 5 +++++ arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 20 ++++++++++++-------- arch/arm64/kernel/head.S | 4 ++++ arch/arm64/kernel/stacktrace.c | 33 ++++++--------------------------- arch/arm64/kernel/traps.c | 32 +++++++------------------------- 8 files changed, 36 insertions(+), 85 deletions(-) (limited to 'arch/arm64/kernel/entry.S')
diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 8155e486ce48..8ba89c4ca183 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -16,31 +16,6 @@ struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); -/* - * The highest address on the stack, and the first to be used. Used to - * find the dummy-stack frame put down by el?_irq() in entry.S, which - * is structured as follows: - * - * ------------ - * | | <- irq_stack_ptr - * top ------------ - * | x19 | <- irq_stack_ptr - 0x08 - * ------------ - * | x29 | <- irq_stack_ptr - 0x10 - * ------------ - * - * where x19 holds a copy of the task stack pointer where the struct pt_regs - * from kernel_entry can be found. - * - */ -#define IRQ_STACK_PTR() ((unsigned long)raw_cpu_ptr(irq_stack) + IRQ_STACK_START_SP) - -/* - * The offset from irq_stack_ptr where entry.S will store the original - * stack pointer. Used by unwind_frame() and dump_backtrace().
- */ -#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08))) - extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); static inline int nr_legacy_irqs(void) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 11403fdd0a50..ee72aa979078 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -119,6 +119,7 @@ struct pt_regs { u64 syscallno; u64 orig_addr_limit; u64 unused; // maintain 16 byte alignment + u64 stackframe[2]; }; #define MAX_REG_OFFSET offsetof(struct pt_regs, pstate) diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index 02e9035b0685..41361684580d 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -60,4 +60,9 @@ static inline int in_exception_text(unsigned long ptr) return in ? : __in_irqentry_text(ptr); } +static inline int in_entry_text(unsigned long ptr) +{ + return ptr >= (unsigned long)&__entry_text_start && + ptr < (unsigned long)&__entry_text_end; +} #endif diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index b3bb7ef97bc8..71bf088f1e4b 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -75,6 +75,7 @@ int main(void) DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); + DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 9e126d3d8b53..612a077ba109 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -111,6 +111,18 @@ mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + /* + * In order to be able to dump the contents of struct pt_regs at the + * time the exception was taken (in case we attempt to walk the call + * stack later), chain it together with the stack frames. + */ + .if \el == 0 + stp xzr, xzr, [sp, #S_STACKFRAME] + .else + stp x29, x22, [sp, #S_STACKFRAME] + .endif + add x29, sp, #S_STACKFRAME + #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Set the TTBR0 PAN bit in SPSR. When the exception is taken from @@ -265,14 +277,6 @@ alternative_else_nop_endif /* switch to the irq stack */ mov sp, x26 - - /* - * Add a dummy stack frame, this non-standard format is fixed up - * by unwind_frame() - */ - stp x29, x19, [sp, #-16]! - mov x29, sp - 9998: .endm diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 973df7de7bf8..f9e4aacf4f42 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -362,6 +362,9 @@ __primary_switched: ret // to __primary_switch() 0: #endif + add sp, sp, #16 + mov x29, #0 + mov x30, #0 b start_kernel ENDPROC(__primary_switched) @@ -617,6 +620,7 @@ __secondary_switched: ldr x2, [x0, #CPU_BOOT_TASK] msr sp_el0, x2 mov x29, #0 + mov x30, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index beaf51fb3088..81d9262acaf0 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -76,34 +76,13 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* - * Check whether we are going to walk through from interrupt stack - * to task stack. 
- * If we reach the end of the stack - and its an interrupt stack, - * unpack the dummy frame to find the original elr. - * - * Check the frame->fp we read from the bottom of the irq_stack, - * and the original task stack pointer are both in current->stack. + * Frames created upon entry from EL0 have NULL FP and PC values, so + * don't bother reporting these. Frames created by __noreturn functions + * might have a valid FP even if PC is bogus, so only terminate where + * both are NULL. */ - if (frame->sp == IRQ_STACK_PTR()) { - struct pt_regs *irq_args; - unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(frame->sp); - - if (object_is_on_stack((void *)orig_sp) && - object_is_on_stack((void *)frame->fp)) { - frame->sp = orig_sp; - - /* orig_sp is the saved pt_regs, find the elr */ - irq_args = (struct pt_regs *)orig_sp; - frame->pc = irq_args->pc; - } else { - /* - * This frame has a non-standard format, and we - * didn't fix it, because the data looked wrong. - * Refuse to output this frame. - */ - return -EINVAL; - } - } + if (!frame->fp && !frame->pc) + return -EINVAL; return 0; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 5797f5037ec9..075c29a24345 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -143,7 +143,6 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - unsigned long irq_stack_ptr; int skip; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -154,15 +153,6 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (!try_get_task_stack(tsk)) return; - /* - * Switching between stacks is valid when tracing current and in - * non-preemptible context. - */ - if (tsk == current && !preemptible()) - irq_stack_ptr = IRQ_STACK_PTR(); - else - irq_stack_ptr = 0; - if (tsk == current) { frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; @@ -182,13 +172,12 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) skip = !!regs; printk("Call trace:\n"); while (1) { - unsigned long where = frame.pc; unsigned long stack; int ret; /* skip until specified stack frame */ if (!skip) { - dump_backtrace_entry(where); + dump_backtrace_entry(frame.pc); } else if (frame.fp == regs->regs[29]) { skip = 0; /* @@ -203,20 +192,13 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) ret = unwind_frame(tsk, &frame); if (ret < 0) break; - stack = frame.sp; - if (in_exception_text(where)) { - /* - * If we switched to the irq_stack before calling this - * exception handler, then the pt_regs will be on the - * task stack. The easiest way to tell is if the large - * pt_regs would overlap with the end of the irq_stack. 
- */ - if (stack < irq_stack_ptr && - (stack + sizeof(struct pt_regs)) > irq_stack_ptr) - stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + if (in_entry_text(frame.pc)) { + stack = frame.fp - offsetof(struct pt_regs, stackframe); - dump_mem("", "Exception stack", stack, - stack + sizeof(struct pt_regs)); + if (on_task_stack(tsk, stack) || + (tsk == current && !preemptible() && on_irq_stack(stack))) + dump_mem("", "Exception stack", stack, + stack + sizeof(struct pt_regs)); } } -- cgit v1.2.3-59-g8ed1b From 34be98f4944f99076f049a6806fc5f5207a755d3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 20 Jul 2017 17:15:45 +0100 Subject: arm64: kernel: remove {THREAD,IRQ_STACK}_START_SP For historical reasons, we leave the top 16 bytes of our task and IRQ stacks unused, a practice used to ensure that the SP can always be masked to find the base of the current stack (historically, where thread_info could be found). However, this is not necessary, as: * When an exception is taken from a task stack, we decrement the SP by S_FRAME_SIZE and stash the exception registers before we compare the SP against the task stack. In such cases, the SP must be at least S_FRAME_SIZE below the limit, and can be safely masked to determine whether the task stack is in use. * When transitioning to an IRQ stack, we'll place a dummy frame onto the IRQ stack before enabling asynchronous exceptions, or executing code we expect to trigger faults. Thus, if an exception is taken from the IRQ stack, the SP must be at least 16 bytes below the limit. * We no longer mask the SP to find the thread_info, which is now found via sp_el0. Note that historically, the offset was critical to ensure that cpu_switch_to() found the correct stack for new threads that hadn't yet executed ret_from_fork(). Given that, this initial offset serves no purpose, and can be removed. This brings us in-line with other architectures (e.g. x86) which do not rely on this masking. 
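Concretely (a sketch assuming THREAD_SIZE stays a power of two, as it is on arm64), the pt_regs area now sits flush against the top of the stack, and masking still recovers the base for any SP that lies within the stack:

	void *base = task_stack_page(p);				/* lowest address of the stack */
	struct pt_regs *regs = (struct pt_regs *)(THREAD_SIZE + base) - 1;	/* task_pt_regs(p), per the hunk below */
	unsigned long stack_base = sp & ~(THREAD_SIZE - 1);		/* valid for base <= sp < base + THREAD_SIZE */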
Signed-off-by: Ard Biesheuvel [Mark: rebase, kill THREAD_START_SP, commit msg additions] Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Tested-by: Laura Abbott Cc: Catalin Marinas Cc: James Morse --- arch/arm64/include/asm/irq.h | 5 ++--- arch/arm64/include/asm/processor.h | 2 +- arch/arm64/include/asm/thread_info.h | 1 - arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/smp.c | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/arm64/kernel/entry.S') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 8ba89c4ca183..1ebe202b1a24 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -2,7 +2,6 @@ #define __ASM_IRQ_H #define IRQ_STACK_SIZE THREAD_SIZE -#define IRQ_STACK_START_SP THREAD_START_SP #ifndef __ASSEMBLER__ @@ -26,9 +25,9 @@ static inline int nr_legacy_irqs(void) static inline bool on_irq_stack(unsigned long sp) { unsigned long low = (unsigned long)raw_cpu_ptr(irq_stack); - unsigned long high = low + IRQ_STACK_START_SP; + unsigned long high = low + IRQ_STACK_SIZE; - return (low <= sp && sp <= high); + return (low <= sp && sp < high); } static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 64c9e78f9882..6687dd29f7e0 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -159,7 +159,7 @@ extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); #define task_pt_regs(p) \ - ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) + ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) #define KSTK_EIP(tsk) ((unsigned long)task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) user_stack_pointer(task_pt_regs(tsk)) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 46c3b93cf865..b29ab0e12e60 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -30,7 +30,6 @@ #endif #define THREAD_SIZE 16384 -#define THREAD_START_SP (THREAD_SIZE - 16) #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 612a077ba109..f31c7b26a686 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -272,7 +272,7 @@ alternative_else_nop_endif cbnz x25, 9998f adr_this_cpu x25, irq_stack, x26 - mov x26, #IRQ_STACK_START_SP + mov x26, #IRQ_STACK_SIZE add x26, x25, x26 /* switch to the irq stack */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dc66e6ec3a99..f13ddb2404f9 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -154,7 +154,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * page tables. */ secondary_data.task = idle; - secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; + secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); -- cgit v1.2.3-59-g8ed1b From b11e5759bfac0c474d95ec4780b1566350e64cad Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 19 Jul 2017 17:24:49 +0100 Subject: arm64: factor out entry stack manipulation In subsequent patches, we will detect stack overflow in our exception entry code, by verifying the SP after it has been decremented to make space for the exception regs. This verification code is small, and we can minimize its impact by placing it directly in the vectors. 
To avoid redundant modification of the SP, we also need to move the initial decrement of the SP into the vectors. As a preparatory step, this patch introduces kernel_ventry, which performs this decrement, and updates the entry code accordingly. Subsequent patches will fold SP verification into kernel_ventry. There should be no functional change as a result of this patch. Signed-off-by: Ard Biesheuvel [Mark: turn into prep patch, expand commit msg] Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Tested-by: Laura Abbott Cc: Catalin Marinas Cc: James Morse --- arch/arm64/kernel/entry.S | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) (limited to 'arch/arm64/kernel/entry.S') diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index f31c7b26a686..58eba94279c5 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -69,8 +69,13 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_entry, el, regsize = 64 + .macro kernel_ventry label + .align 7 sub sp, sp, #S_FRAME_SIZE + b \label + .endm + + .macro kernel_entry, el, regsize = 64 .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 .endif @@ -319,31 +324,31 @@ tsk .req x28 // current thread_info .align 11 ENTRY(vectors) - ventry el1_sync_invalid // Synchronous EL1t - ventry el1_irq_invalid // IRQ EL1t - ventry el1_fiq_invalid // FIQ EL1t - ventry el1_error_invalid // Error EL1t + kernel_ventry el1_sync_invalid // Synchronous EL1t + kernel_ventry el1_irq_invalid // IRQ EL1t + kernel_ventry el1_fiq_invalid // FIQ EL1t + kernel_ventry el1_error_invalid // Error EL1t - ventry el1_sync // Synchronous EL1h - ventry el1_irq // IRQ EL1h - ventry el1_fiq_invalid // FIQ EL1h - ventry el1_error_invalid // Error EL1h + kernel_ventry el1_sync // Synchronous EL1h + kernel_ventry el1_irq // IRQ EL1h + kernel_ventry el1_fiq_invalid // FIQ EL1h + kernel_ventry el1_error_invalid // Error EL1h - ventry el0_sync // Synchronous 64-bit EL0 - ventry el0_irq // IRQ 64-bit EL0 - ventry el0_fiq_invalid // FIQ 64-bit EL0 - ventry el0_error_invalid // Error 64-bit EL0 + kernel_ventry el0_sync // Synchronous 64-bit EL0 + kernel_ventry el0_irq // IRQ 64-bit EL0 + kernel_ventry el0_fiq_invalid // FIQ 64-bit EL0 + kernel_ventry el0_error_invalid // Error 64-bit EL0 #ifdef CONFIG_COMPAT - ventry el0_sync_compat // Synchronous 32-bit EL0 - ventry el0_irq_compat // IRQ 32-bit EL0 - ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - ventry el0_error_invalid_compat // Error 32-bit EL0 + kernel_ventry el0_sync_compat // Synchronous 32-bit EL0 + kernel_ventry el0_irq_compat // IRQ 32-bit EL0 + kernel_ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 + kernel_ventry el0_error_invalid_compat // Error 32-bit EL0 #else - ventry el0_sync_invalid // Synchronous 32-bit EL0 - ventry el0_irq_invalid // IRQ 32-bit EL0 - ventry el0_fiq_invalid // FIQ 32-bit EL0 - ventry el0_error_invalid // Error 32-bit EL0 + kernel_ventry el0_sync_invalid // Synchronous 32-bit EL0 + kernel_ventry el0_irq_invalid // IRQ 32-bit EL0 + kernel_ventry el0_fiq_invalid // FIQ 32-bit EL0 + kernel_ventry el0_error_invalid // Error 32-bit EL0 #endif END(vectors) -- cgit v1.2.3-59-g8ed1b From f60fe78f133243e6de0f05fdefc3ed2f3c5085ca Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 31 Jul 2017 21:17:03 +0100 Subject: arm64: use an irq stack pointer We allocate our IRQ stacks using a percpu array. This allows us to generate our IRQ stack pointers with adr_this_cpu, but bloats the kernel Image with the boot CPU's IRQ stack. 
Additionally, these are packed with other percpu variables, and aren't guaranteed to have guard pages. When we enable VMAP_STACK we'll want to vmap our IRQ stacks also, in order to provide guard pages and to permit more stringent alignment requirements. Doing so will require that we use a percpu pointer to each IRQ stack, rather than allocating a percpu IRQ stack in the kernel image.
This patch updates our IRQ stack code to use a percpu pointer to the base of each IRQ stack. This will allow us to change the way the stack is allocated with minimal changes elsewhere. In some cases we may try to backtrace before the IRQ stack pointers are initialised, so on_irq_stack() is updated to account for this.
In testing with cyclictest, there was no measurable difference between using adr_this_cpu (for irq_stack) and ldr_this_cpu (for irq_stack_ptr) in the IRQ entry path.
Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Tested-by: Laura Abbott Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: James Morse --- arch/arm64/include/asm/stacktrace.h | 7 +++++-- arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/irq.c | 10 ++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/arm64/kernel/entry.S')
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 000e24182a5c..4c68d8a81988 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -36,13 +36,16 @@ extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk); -DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); +DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); static inline bool on_irq_stack(unsigned long sp) { - unsigned long low = (unsigned long)raw_cpu_ptr(irq_stack); + unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); unsigned long high = low + IRQ_STACK_SIZE; + if (!low) + return false; + return (low <= sp && sp < high); }
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 58eba94279c5..52348869f82f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -276,7 +276,7 @@ alternative_else_nop_endif and x25, x25, #~(THREAD_SIZE - 1) cbnz x25, 9998f - adr_this_cpu x25, irq_stack, x26 + ldr_this_cpu x25, irq_stack_ptr, x26 mov x26, #IRQ_STACK_SIZE add x26, x25, x26
diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 2386b26c0712..5141282e47d5 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -32,6 +32,7 @@ unsigned long irq_err_count; /* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned.
*/ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); +DEFINE_PER_CPU(unsigned long *, irq_stack_ptr); int arch_show_interrupts(struct seq_file *p, int prec) { @@ -50,8 +51,17 @@ void __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) handle_arch_irq = handle_irq; } +static void init_irq_stacks(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu); +} + void __init init_IRQ(void) { + init_irq_stacks(); irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); -- cgit v1.2.3-59-g8ed1b
From 872d8327ce8982883b8237b2c320c8666f14e561 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 14 Jul 2017 20:30:35 +0100 Subject: arm64: add VMAP_STACK overflow detection
This patch adds stack overflow detection to arm64, usable when vmap'd stacks are in use.
Overflow is detected in a small preamble executed for each exception entry, which checks whether there is enough space on the current stack for the general purpose registers to be saved. If there is not enough space, the overflow handler is invoked on a per-cpu overflow stack. This approach preserves the original exception information in ESR_EL1 (and where appropriate, FAR_EL1).
Task and IRQ stacks are aligned to double their size, enabling overflow to be detected with a single bit test. For example, a 16K stack is aligned to 32K, ensuring that bit 14 of the SP must be zero. On an overflow (or underflow), this bit is flipped. Thus, overflow (of less than the size of the stack) can be detected by testing whether this bit is set.
The overflow check is performed before any attempt is made to access the stack, avoiding recursive faults (and the loss of exception information these would entail). As logical operations cannot be performed on the SP directly, the SP is temporarily swapped with a general purpose register using arithmetic operations to enable the test to be performed.
This gives us a useful error message on stack overflow, as can be triggered with the LKDTM overflow test: [ 305.388749] lkdtm: Performing direct entry OVERFLOW [ 305.395444] Insufficient stack space to handle exception!
[ 305.395482] ESR: 0x96000047 -- DABT (current EL) [ 305.399890] FAR: 0xffff00000a5e7f30 [ 305.401315] Task stack: [0xffff00000a5e8000..0xffff00000a5ec000] [ 305.403815] IRQ stack: [0xffff000008000000..0xffff000008004000] [ 305.407035] Overflow stack: [0xffff80003efce4e0..0xffff80003efcf4e0] [ 305.409622] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5 [ 305.412785] Hardware name: linux,dummy-virt (DT) [ 305.415756] task: ffff80003d051c00 task.stack: ffff00000a5e8000 [ 305.419221] PC is at recursive_loop+0x10/0x48 [ 305.421637] LR is at recursive_loop+0x38/0x48 [ 305.423768] pc : [] lr : [] pstate: 40000145 [ 305.428020] sp : ffff00000a5e7f50 [ 305.430469] x29: ffff00000a5e8350 x28: ffff80003d051c00 [ 305.433191] x27: ffff000008981000 x26: ffff000008f80400 [ 305.439012] x25: ffff00000a5ebeb8 x24: ffff00000a5ebeb8 [ 305.440369] x23: ffff000008f80138 x22: 0000000000000009 [ 305.442241] x21: ffff80003ce65000 x20: ffff000008f80188 [ 305.444552] x19: 0000000000000013 x18: 0000000000000006 [ 305.446032] x17: 0000ffffa2601280 x16: ffff0000081fe0b8 [ 305.448252] x15: ffff000008ff546d x14: 000000000047a4c8 [ 305.450246] x13: ffff000008ff7872 x12: 0000000005f5e0ff [ 305.452953] x11: ffff000008ed2548 x10: 000000000005ee8d [ 305.454824] x9 : ffff000008545380 x8 : ffff00000a5e8770 [ 305.457105] x7 : 1313131313131313 x6 : 00000000000000e1 [ 305.459285] x5 : 0000000000000000 x4 : 0000000000000000 [ 305.461781] x3 : 0000000000000000 x2 : 0000000000000400 [ 305.465119] x1 : 0000000000000013 x0 : 0000000000000012 [ 305.467724] Kernel panic - not syncing: kernel stack overflow [ 305.470561] CPU: 0 PID: 1219 Comm: sh Not tainted 4.13.0-rc3-00021-g9636aea #5 [ 305.473325] Hardware name: linux,dummy-virt (DT) [ 305.475070] Call trace: [ 305.476116] [] dump_backtrace+0x0/0x378 [ 305.478991] [] show_stack+0x14/0x20 [ 305.481237] [] dump_stack+0x98/0xb8 [ 305.483294] [] panic+0x118/0x280 [ 305.485673] [] nmi_panic+0x6c/0x70 [ 305.486216] [] handle_bad_stack+0x118/0x128 [ 305.486612] Exception stack(0xffff80003efcf3a0 to 0xffff80003efcf4e0) [ 305.487334] f3a0: 0000000000000012 0000000000000013 0000000000000400 0000000000000000 [ 305.488025] f3c0: 0000000000000000 0000000000000000 00000000000000e1 1313131313131313 [ 305.488908] f3e0: ffff00000a5e8770 ffff000008545380 000000000005ee8d ffff000008ed2548 [ 305.489403] f400: 0000000005f5e0ff ffff000008ff7872 000000000047a4c8 ffff000008ff546d [ 305.489759] f420: ffff0000081fe0b8 0000ffffa2601280 0000000000000006 0000000000000013 [ 305.490256] f440: ffff000008f80188 ffff80003ce65000 0000000000000009 ffff000008f80138 [ 305.490683] f460: ffff00000a5ebeb8 ffff00000a5ebeb8 ffff000008f80400 ffff000008981000 [ 305.491051] f480: ffff80003d051c00 ffff00000a5e8350 ffff00000859f358 ffff00000a5e7f50 [ 305.491444] f4a0: ffff00000859f330 0000000040000145 0000000000000000 0000000000000000 [ 305.492008] f4c0: 0001000000000000 0000000000000000 ffff00000a5e8350 ffff00000859f330 [ 305.493063] [] __bad_stack+0x88/0x8c [ 305.493396] [] recursive_loop+0x10/0x48 [ 305.493731] [] recursive_loop+0x38/0x48 [ 305.494088] [] recursive_loop+0x38/0x48 [ 305.494425] [] recursive_loop+0x38/0x48 [ 305.494649] [] recursive_loop+0x38/0x48 [ 305.494898] [] recursive_loop+0x38/0x48 [ 305.495205] [] recursive_loop+0x38/0x48 [ 305.495453] [] recursive_loop+0x38/0x48 [ 305.495708] [] recursive_loop+0x38/0x48 [ 305.496000] [] recursive_loop+0x38/0x48 [ 305.496302] [] recursive_loop+0x38/0x48 [ 305.496644] [] recursive_loop+0x38/0x48 [ 305.496894] [] recursive_loop+0x38/0x48 [ 305.497138] [] 
recursive_loop+0x38/0x48 [ 305.497325] [] lkdtm_OVERFLOW+0x14/0x20 [ 305.497506] [] lkdtm_do_action+0x1c/0x28 [ 305.497786] [] direct_entry+0xe0/0x170 [ 305.498095] [] full_proxy_write+0x60/0xa8 [ 305.498387] [] __vfs_write+0x1c/0x128 [ 305.498679] [] vfs_write+0xa0/0x1b0 [ 305.498926] [] SyS_write+0x44/0xa0 [ 305.499182] Exception stack(0xffff00000a5ebec0 to 0xffff00000a5ec000) [ 305.499429] bec0: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0 [ 305.499674] bee0: 574f4c465245564f 0000000000000000 0000000000000000 8000000080808080 [ 305.499904] bf00: 0000000000000040 0000000000000038 fefefeff1b4bc2ff 7f7f7f7f7f7fff7f [ 305.500189] bf20: 0101010101010101 0000000000000000 000000000047a4c8 0000000000000038 [ 305.500712] bf40: 0000000000000000 0000ffffa2601280 0000ffffc63f6068 00000000004b5000 [ 305.501241] bf60: 0000000000000001 000000001c4cf5e0 0000000000000009 000000001c4cf5e0 [ 305.501791] bf80: 0000000000000020 0000000000000000 00000000004b5000 000000001c4cc458 [ 305.502314] bfa0: 0000000000000000 0000ffffc63f7950 000000000040a3c4 0000ffffc63f70e0 [ 305.502762] bfc0: 0000ffffa2601268 0000000080000000 0000000000000001 0000000000000040 [ 305.503207] bfe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 305.503680] [] el0_svc_naked+0x24/0x28 [ 305.504720] Kernel Offset: disabled [ 305.505189] CPU features: 0x002082 [ 305.505473] Memory Limit: none [ 305.506181] ---[ end Kernel panic - not syncing: kernel stack overflow This patch was co-authored by Ard Biesheuvel and Mark Rutland. Signed-off-by: Ard Biesheuvel Signed-off-by: Mark Rutland Reviewed-by: Will Deacon Tested-by: Laura Abbott Cc: Catalin Marinas Cc: James Morse --- arch/arm64/include/asm/memory.h | 2 ++ arch/arm64/include/asm/stacktrace.h | 16 +++++++++ arch/arm64/kernel/entry.S | 70 +++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/traps.c | 39 +++++++++++++++++++++ 4 files changed, 127 insertions(+) (limited to 'arch/arm64/kernel/entry.S') diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index c5cd2c599b24..1a025b744107 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -133,6 +133,8 @@ #define IRQ_STACK_SIZE THREAD_SIZE +#define OVERFLOW_STACK_SIZE SZ_4K + /* * Alignment of kernel segments (e.g. .text, .data). */ diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 92ddb6d25cf3..6ad30776e984 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -57,6 +57,20 @@ static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp) return (low <= sp && sp < high); } +#ifdef CONFIG_VMAP_STACK +DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); + +static inline bool on_overflow_stack(unsigned long sp) +{ + unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return (low <= sp && sp < high); +} +#else +static inline bool on_overflow_stack(unsigned long sp) { return false; } +#endif + /* * We can only safely access per-cpu stacks from current in a non-preemptible * context. 
@@ -69,6 +83,8 @@ static inline bool on_accessible_stack(struct task_struct *tsk, unsigned long sp return false; if (on_irq_stack(sp)) return true; + if (on_overflow_stack(sp)) + return true; return false; } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 52348869f82f..3ef6e2297fb4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -72,6 +72,48 @@ .macro kernel_ventry label .align 7 sub sp, sp, #S_FRAME_SIZE +#ifdef CONFIG_VMAP_STACK + /* + * Test whether the SP has overflowed, without corrupting a GPR. + * Task and IRQ stacks are aligned to (1 << THREAD_SHIFT). + */ + add sp, sp, x0 // sp' = sp + x0 + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp + tbnz x0, #THREAD_SHIFT, 0f + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp + b \label + +0: + /* + * Either we've just detected an overflow, or we've taken an exception + * while on the overflow stack. Either way, we won't return to + * userspace, and can clobber EL0 registers to free up GPRs. + */ + + /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + msr tpidr_el0, x0 + + /* Recover the original x0 value and stash it in tpidrro_el0 */ + sub x0, sp, x0 + msr tpidrro_el0, x0 + + /* Switch to the overflow stack */ + adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 + + /* + * Check whether we were already on the overflow stack. This may happen + * after panic() re-enables interrupts. + */ + mrs x0, tpidr_el0 // sp of interrupted context + sub x0, sp, x0 // delta with top of overflow stack + tst x0, #~(OVERFLOW_STACK_SIZE - 1) // within range? + b.ne __bad_stack // no? -> bad stack pointer + + /* We were already on the overflow stack. Restore sp/x0 and carry on. */ + sub sp, sp, x0 + mrs x0, tpidrro_el0 +#endif b \label .endm @@ -352,6 +394,34 @@ ENTRY(vectors) #endif END(vectors) +#ifdef CONFIG_VMAP_STACK + /* + * We detected an overflow in kernel_ventry, which switched to the + * overflow stack. Stash the exception regs, and head to our overflow + * handler. + */ +__bad_stack: + /* Restore the original x0 value */ + mrs x0, tpidrro_el0 + + /* + * Store the original GPRs to the new stack. The orginal SP (minus + * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry. 
+ */ + sub sp, sp, #S_FRAME_SIZE + kernel_entry 1 + mrs x0, tpidr_el0 + add x0, x0, #S_FRAME_SIZE + str x0, [sp, #S_SP] + + /* Stash the regs for handle_bad_stack */ + mov x0, sp + + /* Time to die */ + bl handle_bad_stack + ASM_BUG() +#endif /* CONFIG_VMAP_STACK */ + /* * Invalid mode handlers */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index d01c5988354b..2d591804e46f 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -666,6 +668,43 @@ asmlinkage void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) force_sig_info(info.si_signo, &info, current); } +#ifdef CONFIG_VMAP_STACK + +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) + __aligned(16); + +asmlinkage void handle_bad_stack(struct pt_regs *regs) +{ + unsigned long tsk_stk = (unsigned long)current->stack; + unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); + unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned int esr = read_sysreg(esr_el1); + unsigned long far = read_sysreg(far_el1); + + console_verbose(); + pr_emerg("Insufficient stack space to handle exception!"); + + pr_emerg("ESR: 0x%08x -- %s\n", esr, esr_get_class_string(esr)); + pr_emerg("FAR: 0x%016lx\n", far); + + pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", + tsk_stk, tsk_stk + THREAD_SIZE); + pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", + irq_stk, irq_stk + THREAD_SIZE); + pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", + ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); + + __show_regs(regs); + + /* + * We use nmi_panic to limit the potential for recusive overflows, and + * to get a better stack trace. + */ + nmi_panic(NULL, "kernel stack overflow"); + cpu_park_loop(); +} +#endif + void __pte_error(const char *file, int line, unsigned long val) { pr_err("%s:%d: bad pte %016lx.\n", file, line, val); -- cgit v1.2.3-59-g8ed1b
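To make the alignment trick concrete, here is a minimal C model of the check that kernel_ventry performs (illustrative only, assuming 16K stacks aligned to 32K as in the commit message above; the names below are not kernel identifiers):

	#define EXAMPLE_THREAD_SHIFT	14			/* 16K stacks */
	#define EXAMPLE_THREAD_SIZE	(1UL << EXAMPLE_THREAD_SHIFT)

	/*
	 * With the stack base aligned to 2 * EXAMPLE_THREAD_SIZE, bit 14 of any
	 * SP inside [base, base + EXAMPLE_THREAD_SIZE) is zero; an overflow or
	 * underflow of less than the stack size flips it.
	 */
	static int sp_out_of_bounds(unsigned long sp)
	{
		return (sp & EXAMPLE_THREAD_SIZE) != 0;
	}

The entry code cannot perform logical operations on the SP directly and has no free general purpose register at that point, which is why the kernel_ventry hunk above temporarily folds x0 into the SP with add/sub arithmetic and then performs this test with a single tbnz on THREAD_SHIFT.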