diff options
Diffstat (limited to 'arch/x86/kernel/traps.c')
-rw-r--r-- | arch/x86/kernel/traps.c | 391 |
1 files changed, 255 insertions, 136 deletions
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d62b2cb85cea..36354b470590 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -15,6 +15,7 @@ #include <linux/context_tracking.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> +#include <linux/kmsan.h> #include <linux/spinlock.h> #include <linux/kprobes.h> #include <linux/uaccess.h> @@ -36,10 +37,12 @@ #include <linux/nmi.h> #include <linux/mm.h> #include <linux/smp.h> +#include <linux/cpu.h> #include <linux/io.h> #include <linux/hardirq.h> #include <linux/atomic.h> -#include <linux/ioasid.h> +#include <linux/iommu.h> +#include <linux/ubsan.h> #include <asm/stacktrace.h> #include <asm/processor.h> @@ -49,6 +52,7 @@ #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> +#include <asm/fred.h> #include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> @@ -63,29 +67,19 @@ #include <asm/insn-eval.h> #include <asm/vdso.h> #include <asm/tdx.h> +#include <asm/cfi.h> +#include <asm/msr.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> -#include <asm/proto.h> #else #include <asm/processor-flags.h> #include <asm/setup.h> -#include <asm/proto.h> #endif -DECLARE_BITMAP(system_vectors, NR_VECTORS); - -static inline void cond_local_irq_enable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} +#include <asm/proto.h> -static inline void cond_local_irq_disable(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} +DECLARE_BITMAP(system_vectors, NR_VECTORS); __always_inline int is_valid_bugaddr(unsigned long addr) { @@ -99,6 +93,96 @@ __always_inline int is_valid_bugaddr(unsigned long addr) return *(unsigned short *)addr == INSN_UD2; } +/* + * Check for UD1 or UD2, accounting for Address Size Override Prefixes. + * If it's a UD1, further decode to determine its use: + * + * FineIBT: ea (bad) + * FineIBT: f0 75 f9 lock jne . - 6 + * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax + * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax + * static_call: 0f b9 cc ud1 %esp,%ecx + * + * Notably UBSAN uses EAX, static_call uses ECX. + */ +__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) +{ + unsigned long start = addr; + bool lock = false; + u8 v; + + if (addr < TASK_SIZE_MAX) + return BUG_NONE; + + v = *(u8 *)(addr++); + if (v == INSN_ASOP) + v = *(u8 *)(addr++); + + if (v == INSN_LOCK) { + lock = true; + v = *(u8 *)(addr++); + } + + switch (v) { + case 0x70 ... 0x7f: /* Jcc.d8 */ + addr += 1; /* d8 */ + *len = addr - start; + WARN_ON_ONCE(!lock); + return BUG_LOCK; + + case 0xea: + *len = addr - start; + return BUG_EA; + + case OPCODE_ESCAPE: + break; + + default: + return BUG_NONE; + } + + v = *(u8 *)(addr++); + if (v == SECOND_BYTE_OPCODE_UD2) { + *len = addr - start; + return BUG_UD2; + } + + if (v != SECOND_BYTE_OPCODE_UD1) + return BUG_NONE; + + *imm = 0; + v = *(u8 *)(addr++); /* ModRM */ + + if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4) + addr++; /* SIB */ + + /* Decode immediate, if present */ + switch (X86_MODRM_MOD(v)) { + case 0: if (X86_MODRM_RM(v) == 5) + addr += 4; /* RIP + disp32 */ + break; + + case 1: *imm = *(s8 *)addr; + addr += 1; + break; + + case 2: *imm = *(s32 *)addr; + addr += 4; + break; + + case 3: break; + } + + /* record instruction length */ + *len = addr - start; + + if (X86_MODRM_REG(v) == 0) /* EAX */ + return BUG_UD1_UBSAN; + + return BUG_UD1; +} + + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) @@ -211,81 +295,6 @@ DEFINE_IDTENTRY(exc_overflow) do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } -#ifdef CONFIG_X86_KERNEL_IBT - -static __ro_after_init bool ibt_fatal = true; - -extern void ibt_selftest_ip(void); /* code label defined in asm below */ - -enum cp_error_code { - CP_EC = (1 << 15) - 1, - - CP_RET = 1, - CP_IRET = 2, - CP_ENDBR = 3, - CP_RSTRORSSP = 4, - CP_SETSSBSY = 5, - - CP_ENCL = 1 << 15, -}; - -DEFINE_IDTENTRY_ERRORCODE(exc_control_protection) -{ - if (!cpu_feature_enabled(X86_FEATURE_IBT)) { - pr_err("Unexpected #CP\n"); - BUG(); - } - - if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR)) - return; - - if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) { - regs->ax = 0; - return; - } - - pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs)); - if (!ibt_fatal) { - printk(KERN_DEFAULT CUT_HERE); - __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); - return; - } - BUG(); -} - -/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */ -noinline bool ibt_selftest(void) -{ - unsigned long ret; - - asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t" - ANNOTATE_RETPOLINE_SAFE - " jmp *%%rax\n\t" - "ibt_selftest_ip:\n\t" - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - " nop\n\t" - - : "=a" (ret) : : "memory"); - - return !ret; -} - -static int __init ibt_setup(char *str) -{ - if (!strcmp(str, "off")) - setup_clear_cpu_cap(X86_FEATURE_IBT); - - if (!strcmp(str, "warn")) - ibt_fatal = false; - - return 1; -} - -__setup("ibt=", ibt_setup); - -#endif /* CONFIG_X86_KERNEL_IBT */ - #ifdef CONFIG_X86_F00F_BUG void handle_invalid_op(struct pt_regs *regs) #else @@ -298,9 +307,13 @@ static inline void handle_invalid_op(struct pt_regs *regs) static noinstr bool handle_bug(struct pt_regs *regs) { + unsigned long addr = regs->ip; bool handled = false; + int ud_type, ud_len; + s32 ud_imm; - if (!is_valid_bugaddr(regs->ip)) + ud_type = decode_bug(addr, &ud_imm, &ud_len); + if (ud_type == BUG_NONE) return handled; /* @@ -308,15 +321,58 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ instrumentation_begin(); /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). + */ + kmsan_unpoison_entry_regs(regs); + /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { - regs->ip += LEN_UD2; - handled = true; + + switch (ud_type) { + case BUG_UD2: + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { + handled = true; + break; + } + fallthrough; + + case BUG_EA: + case BUG_LOCK: + if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { + handled = true; + break; + } + break; + + case BUG_UD1_UBSAN: + if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { + pr_crit("%s at %pS\n", + report_ubsan_failure(ud_imm), + (void *)regs->ip); + } + break; + + default: + break; } + + /* + * When continuing, and regs->ip hasn't changed, move it to the next + * instruction. When not continuing execution, restore the instruction + * pointer. + */ + if (handled) { + if (regs->ip == addr) + regs->ip += ud_len; + } else { + regs->ip = addr; + } + if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); @@ -407,6 +463,21 @@ __visible void __noreturn handle_stack_overflow(struct pt_regs *regs, #endif /* + * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64 + * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch + * between configs triggers objtool warnings. + * + * This is a temporary hack until we have compiler or plugin support for + * annotating noreturns. + */ +#ifdef CONFIG_X86_ESPFIX64 +#define always_true() true +#else +bool always_true(void); +bool __weak always_true(void) { return true; } +#endif + +/* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * * On x86_64, this is more or less a normal kernel entry. Notwithstanding the @@ -541,7 +612,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault) pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); - panic("Machine halted."); + if (always_true()) + panic("Machine halted."); instrumentation_end(); } @@ -643,7 +715,7 @@ static bool fixup_iopl_exception(struct pt_regs *regs) */ static bool try_fixup_enqcmd_gp(void) { -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID u32 pasid; /* @@ -662,15 +734,15 @@ static bool try_fixup_enqcmd_gp(void) if (!cpu_feature_enabled(X86_FEATURE_ENQCMD)) return false; - pasid = current->mm->pasid; - /* * If the mm has not been allocated a * PASID, the #GP can not be fixed up. */ - if (!pasid_valid(pasid)) + if (!mm_valid_pasid(current->mm)) return false; + pasid = mm_get_enqcmd_pasid(current->mm); + /* * Did this thread already have its PASID activated? * If so, the #GP must be from something else. @@ -678,7 +750,7 @@ static bool try_fixup_enqcmd_gp(void) if (current->pasid_activated) return false; - wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); + wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; @@ -688,9 +760,10 @@ static bool try_fixup_enqcmd_gp(void) } static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr, - unsigned long error_code, const char *str) + unsigned long error_code, const char *str, + unsigned long address) { - if (fixup_exception(regs, trapnr, error_code, 0)) + if (fixup_exception(regs, trapnr, error_code, address)) return true; current->thread.error_code = error_code; @@ -750,7 +823,7 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) goto exit; } - if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc)) + if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc, 0)) goto exit; if (error_code) @@ -810,16 +883,16 @@ static void do_int3_user(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_int3) { /* - * poke_int3_handler() is completely self contained code; it does (and + * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ - if (poke_int3_handler(regs)) + if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() - * and therefore can trigger INT3, hence poke_int3_handler() must + * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. @@ -849,7 +922,7 @@ DEFINE_IDTENTRY_RAW(exc_int3) */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; + struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1; if (regs != eregs) *regs = *eregs; return regs; @@ -867,7 +940,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { - sp = this_cpu_read(cpu_current_top_of_stack); + sp = current_top_of_stack(); goto sync; } @@ -949,24 +1022,32 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } -static __always_inline unsigned long debug_read_clear_dr6(void) +static __always_inline unsigned long debug_read_reset_dr6(void) { unsigned long dr6; + get_debugreg(dr6, 6); + dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ + /* * The Intel SDM says: * - * Certain debug exceptions may clear bits 0-3. The remaining - * contents of the DR6 register are never cleared by the - * processor. To avoid confusion in identifying debug - * exceptions, debug handlers should clear the register before - * returning to the interrupted task. + * Certain debug exceptions may clear bits 0-3 of DR6. + * + * BLD induced #DB clears DR6.BLD and any other debug + * exception doesn't modify DR6.BLD. + * + * RTM induced #DB clears DR6.RTM and any other debug + * exception sets DR6.RTM. + * + * To avoid confusion in identifying debug exceptions, + * debug handlers should set DR6.BLD and DR6.RTM, and + * clear other DR6 bits before returning. * - * Keep it simple: clear DR6 immediately. + * Keep it simple: write DR6 with its architectural reset + * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately. */ - get_debugreg(dr6, 6); set_debugreg(DR6_RESERVED, 6); - dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ return dr6; } @@ -1011,8 +1092,7 @@ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) return false; } -static __always_inline void exc_debug_kernel(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions @@ -1024,6 +1104,11 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. + * + * For FRED, nested #DB should just work fine. But when a watchpoint or + * breakpoint is set in the code path which is executed by #DB handler, + * it results in an endless recursion and stack overflow. Thus we stay + * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); @@ -1044,16 +1129,17 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, */ unsigned long debugctl; - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. */ - if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* @@ -1085,8 +1171,7 @@ out: local_db_restore(dr7); } -static __always_inline void exc_debug_user(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; @@ -1162,19 +1247,47 @@ out: /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { - exc_debug_kernel(regs, debug_read_clear_dr6()); + exc_debug_kernel(regs, debug_read_reset_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { - exc_debug_user(regs, debug_read_clear_dr6()); + exc_debug_user(regs, debug_read_reset_dr6()); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #DB needs to be handled on different stack: User #DB on + * current task stack, while kernel #DB on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #DB dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception + * entry stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_DEBUG(exc_debug) +{ + /* + * FRED #DB stores DR6 on the stack in the format which + * debug_read_reset_dr6() returns for the IDT entry points. + */ + unsigned long dr6 = fred_event_data(regs); + + if (user_mode(regs)) + exc_debug_user(regs, dr6); + else + exc_debug_kernel(regs, dr6); +} +#endif /* CONFIG_X86_FRED */ + #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { - unsigned long dr6 = debug_read_clear_dr6(); + unsigned long dr6 = debug_read_reset_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); @@ -1191,7 +1304,7 @@ DEFINE_IDTENTRY_RAW(exc_debug) static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; - struct fpu *fpu = &task->thread.fpu; + struct fpu *fpu = x86_task_fpu(task); int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; @@ -1282,11 +1395,11 @@ static bool handle_xfd_event(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; - rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + rdmsrq(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; - wrmsrl(MSR_IA32_XFD_ERR, 0); + wrmsrq(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) @@ -1348,17 +1461,20 @@ DEFINE_IDTENTRY(exc_device_not_available) #define VE_FAULT_STR "VE fault" -static void ve_raise_fault(struct pt_regs *regs, long error_code) +static void ve_raise_fault(struct pt_regs *regs, long error_code, + unsigned long address) { if (user_mode(regs)) { gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); return; } - if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR)) + if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, + VE_FAULT_STR, address)) { return; + } - die_addr(VE_FAULT_STR, regs, error_code, 0); + die_addr(VE_FAULT_STR, regs, error_code, address); } /* @@ -1422,7 +1538,7 @@ DEFINE_IDTENTRY(exc_virtualization_exception) * it successfully, treat it as #GP(0) and handle it. */ if (!tdx_handle_virt_exception(regs, &ve)) - ve_raise_fault(regs, 0); + ve_raise_fault(regs, 0, ve.gla); cond_local_irq_disable(regs); } @@ -1451,8 +1567,11 @@ void __init trap_init(void) sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ - cpu_init_exception_handling(); + cpu_init_exception_handling(true); + /* Setup traps as cpu_init() might #GP */ - idt_setup_traps(); + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_setup_traps(); + cpu_init(); } |