Diffstat (limited to 'arch/x86/kernel/traps.c')
-rw-r--r--  arch/x86/kernel/traps.c | 304
1 file changed, 265 insertions(+), 39 deletions(-)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c9d566dcf89a..d3fdec706f1d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -15,6 +15,7 @@
#include <linux/context_tracking.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
+#include <linux/kmsan.h>
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
@@ -39,6 +40,7 @@
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
+#include <linux/ioasid.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
@@ -61,6 +63,8 @@
#include <asm/insn.h>
#include <asm/insn-eval.h>
#include <asm/vdso.h>
+#include <asm/tdx.h>
+#include <asm/cfi.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -209,6 +213,81 @@ DEFINE_IDTENTRY(exc_overflow)
do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
}
+#ifdef CONFIG_X86_KERNEL_IBT
+
+static __ro_after_init bool ibt_fatal = true;
+
+extern void ibt_selftest_ip(void); /* code label defined in asm below */
+
+enum cp_error_code {
+ CP_EC = (1 << 15) - 1,
+
+ CP_RET = 1,
+ CP_IRET = 2,
+ CP_ENDBR = 3,
+ CP_RSTRORSSP = 4,
+ CP_SETSSBSY = 5,
+
+ CP_ENCL = 1 << 15,
+};
+
+DEFINE_IDTENTRY_ERRORCODE(exc_control_protection)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_IBT)) {
+ pr_err("Unexpected #CP\n");
+ BUG();
+ }
+
+ if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR))
+ return;
+
+ if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) {
+ regs->ax = 0;
+ return;
+ }
+
+ pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs));
+ if (!ibt_fatal) {
+ printk(KERN_DEFAULT CUT_HERE);
+ __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
+ return;
+ }
+ BUG();
+}
+
+/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */
+noinline bool ibt_selftest(void)
+{
+ unsigned long ret;
+
+ asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t"
+ ANNOTATE_RETPOLINE_SAFE
+ " jmp *%%rax\n\t"
+ "ibt_selftest_ip:\n\t"
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ " nop\n\t"
+
+ : "=a" (ret) : : "memory");
+
+ return !ret;
+}
+
+static int __init ibt_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+
+ if (!strcmp(str, "warn"))
+ ibt_fatal = false;
+
+ return 1;
+}
+
+__setup("ibt=", ibt_setup);
+
+#endif /* CONFIG_X86_KERNEL_IBT */
+
#ifdef CONFIG_X86_F00F_BUG
void handle_invalid_op(struct pt_regs *regs)
#else
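
For orientation, and not part of the patch itself: the cp_error_code values above split the #CP error code into a 15-bit violation type (masked by CP_EC) plus CP_ENCL, which flags a fault raised from inside an SGX enclave. A minimal sketch of how that type field could be decoded for a diagnostic message follows; cp_err_string() is a hypothetical helper and the strings are illustrative only.

/*
 * Illustrative sketch: decode the #CP violation type using the
 * cp_error_code values from the hunk above. Not added by this patch.
 */
static const char *cp_err_string(unsigned long error_code)
{
	switch (error_code & CP_EC) {
	case CP_RET:		return "near RET";
	case CP_IRET:		return "far RET/IRET";
	case CP_ENDBR:		return "missing ENDBR";
	case CP_RSTRORSSP:	return "RSTORSSP";
	case CP_SETSSBSY:	return "SETSSBSY";
	default:		return "unknown";
	}
}

As a usage note grounded in the hunk above: the new ibt= boot parameter accepts "off" (clear X86_FEATURE_IBT entirely) and "warn" (report a missing ENDBR via __warn() instead of BUG()).
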
@@ -223,6 +302,12 @@ static noinstr bool handle_bug(struct pt_regs *regs)
{
bool handled = false;
+ /*
+ * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
+ * is a rare case that uses @regs without passing them to
+ * irqentry_enter().
+ */
+ kmsan_unpoison_entry_regs(regs);
if (!is_valid_bugaddr(regs->ip))
return handled;
@@ -236,7 +321,8 @@ static noinstr bool handle_bug(struct pt_regs *regs)
*/
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_enable();
- if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
+ handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
regs->ip += LEN_UD2;
handled = true;
}
@@ -559,13 +645,94 @@ static bool fixup_iopl_exception(struct pt_regs *regs)
return true;
}
+/*
+ * The unprivileged ENQCMD instruction generates #GPs if the
+ * IA32_PASID MSR has not been populated. If possible, populate
+ * the MSR from a PASID previously allocated to the mm.
+ */
+static bool try_fixup_enqcmd_gp(void)
+{
+#ifdef CONFIG_IOMMU_SVA
+ u32 pasid;
+
+ /*
+ * MSR_IA32_PASID is managed using XSAVE. Directly
+ * writing to the MSR is only possible when fpregs
+ * are valid and the fpstate is not. This is
+ * guaranteed when handling a user-space exception,
+ * before interrupts are re-enabled.
+ */
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * Hardware without ENQCMD will not generate
+ * #GPs that can be fixed up here.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
+ return false;
+
+ pasid = current->mm->pasid;
+
+ /*
+ * If the mm has not been allocated a
+ * PASID, the #GP can not be fixed up.
+ */
+ if (!pasid_valid(pasid))
+ return false;
+
+ /*
+ * Did this thread already have its PASID activated?
+ * If so, the #GP must be from something else.
+ */
+ if (current->pasid_activated)
+ return false;
+
+ wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID);
+ current->pasid_activated = 1;
+
+ return true;
+#else
+ return false;
+#endif
+}
+
+static bool gp_try_fixup_and_notify(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str)
+{
+ if (fixup_exception(regs, trapnr, error_code, 0))
+ return true;
+
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+
+ /*
+ * To be potentially processing a kprobe fault and to trust the result
+ * from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() && kprobe_running() &&
+ kprobe_fault_handler(regs, trapnr))
+ return true;
+
+ return notify_die(DIE_GPF, str, regs, error_code, trapnr, SIGSEGV) == NOTIFY_STOP;
+}
+
+static void gp_user_force_sig_segv(struct pt_regs *regs, int trapnr,
+ unsigned long error_code, const char *str)
+{
+ current->thread.error_code = error_code;
+ current->thread.trap_nr = trapnr;
+ show_signal(current, SIGSEGV, "", str, regs, error_code);
+ force_sig(SIGSEGV);
+}
+
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
enum kernel_gp_hint hint = GP_NO_HINT;
- struct task_struct *tsk;
unsigned long gp_addr;
- int ret;
+
+ if (user_mode(regs) && try_fixup_enqcmd_gp())
+ return;
cond_local_irq_enable(regs);
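
A side note on the wrmsrl() in try_fixup_enqcmd_gp() above: the value written to MSR_IA32_PASID is the mm's PASID with a valid bit set. A stand-alone sketch of that composition, assuming MSR_IA32_PASID_VALID is bit 31 (as defined in msr-index.h at the time of writing) and a 20-bit PASID space:

/*
 * Sketch only, not from this patch: composing the IA32_PASID MSR value.
 * EXAMPLE_PASID_VALID mirrors MSR_IA32_PASID_VALID, assumed to be bit 31.
 */
#include <stdint.h>

#define EXAMPLE_PASID_VALID	(1ull << 31)

static inline uint64_t example_pasid_msr_value(uint32_t pasid)
{
	/* The low 20 bits carry the PASID; bit 31 marks the MSR valid. */
	return (uint64_t)(pasid & 0xfffff) | EXAMPLE_PASID_VALID;
}
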
@@ -581,40 +748,18 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
return;
}
- tsk = current;
-
if (user_mode(regs)) {
if (fixup_iopl_exception(regs))
goto exit;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
goto exit;
- show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
- force_sig(SIGSEGV);
+ gp_user_force_sig_segv(regs, X86_TRAP_GP, error_code, desc);
goto exit;
}
- if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- goto exit;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
-
- /*
- * To be potentially processing a kprobe fault and to trust the result
- * from kprobe_running(), we have to be non-preemptible.
- */
- if (!preemptible() &&
- kprobe_running() &&
- kprobe_fault_handler(regs, X86_TRAP_GP))
- goto exit;
-
- ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
- if (ret == NOTIFY_STOP)
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_GP, error_code, desc))
goto exit;
if (error_code)
@@ -659,6 +804,7 @@ static bool do_int3(struct pt_regs *regs)
return res == NOTIFY_STOP;
}
+NOKPROBE_SYMBOL(do_int3);
static void do_int3_user(struct pt_regs *regs)
{
@@ -761,14 +907,10 @@ sync:
}
#endif
-struct bad_iret_stack {
- void *error_entry_ret;
- struct pt_regs regs;
-};
-
-asmlinkage __visible noinstr
-struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
+asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs)
{
+ struct pt_regs tmp, *new_stack;
+
/*
* This is called from entry_64.S early in handling a fault
* caused by a bad iret to user mode. To handle the fault
@@ -777,19 +919,18 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
* just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/
- struct bad_iret_stack tmp, *new_stack =
- (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
+ new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the temporary storage. */
- __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8);
+ __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8);
/* Copy the remainder of the stack from the current stack. */
- __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip));
+ __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip));
/* Update the entry stack */
__memcpy(new_stack, &tmp, sizeof(tmp));
- BUG_ON(!user_mode(&new_stack->regs));
+ BUG_ON(!user_mode(new_stack));
return new_stack;
}
#endif
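
The 5*8-byte __memcpy() in fixup_bad_iret() above copies the hardware IRET frame, which sits at the tail of struct pt_regs. A reference-only sketch of that tail layout on x86-64 (the authoritative definition is in arch/x86/include/asm/ptrace.h):

/*
 * Reference sketch, not part of this patch: the last five 8-byte fields
 * of struct pt_regs on x86-64 are exactly the hardware IRET frame, which
 * is why fixup_bad_iret() copies 5*8 bytes starting at &tmp.ip.
 */
struct iret_frame_sketch {
	unsigned long ip;	/* RIP pushed by the CPU */
	unsigned long cs;	/* CS                    */
	unsigned long flags;	/* RFLAGS                */
	unsigned long sp;	/* RSP                   */
	unsigned long ss;	/* SS                    */
};				/* sizeof() == 5 * 8     */
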
@@ -1212,6 +1353,91 @@ DEFINE_IDTENTRY(exc_device_not_available)
}
}
+#ifdef CONFIG_INTEL_TDX_GUEST
+
+#define VE_FAULT_STR "VE fault"
+
+static void ve_raise_fault(struct pt_regs *regs, long error_code)
+{
+ if (user_mode(regs)) {
+ gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR);
+ return;
+ }
+
+ if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR))
+ return;
+
+ die_addr(VE_FAULT_STR, regs, error_code, 0);
+}
+
+/*
+ * Virtualization Exceptions (#VE) are delivered to TDX guests due to
+ * specific guest actions which may happen in either user space or the
+ * kernel:
+ *
+ * * Specific instructions (WBINVD, for example)
+ * * Specific MSR accesses
+ * * Specific CPUID leaf accesses
+ * * Access to specific guest physical addresses
+ *
+ * In the settings that Linux will run in, virtualization exceptions are
+ * never generated on accesses to normal, TD-private memory that has been
+ * accepted (by BIOS or with tdx_enc_status_changed()).
+ *
+ * Syscall entry code has a critical window where the kernel stack is not
+ * yet set up. Any exception in this window leads to hard-to-debug issues
+ * and can be exploited for privilege escalation. Exceptions in the NMI
+ * entry code also cause issues: returning from the exception handler with
+ * IRET will re-enable NMIs, and a nested NMI will corrupt the NMI stack.
+ *
+ * For these reasons, the kernel avoids #VEs during the syscall gap and
+ * the NMI entry code. Entry code paths do not access TD-shared memory or
+ * MMIO regions, and do not use #VE-triggering MSRs, instructions, or CPUID
+ * leaves that might generate #VE. The VMM can remove memory from the TD at
+ * any point, but access to unaccepted (or missing) private memory leads to
+ * VM termination, not to a #VE.
+ *
+ * Similarly to page faults and breakpoints, #VEs are allowed in NMI
+ * handlers once the kernel is ready to deal with nested NMIs.
+ *
+ * During #VE delivery, all interrupts, including NMIs, are blocked until
+ * TDGETVEINFO is called. This prevents #VE nesting until the kernel has
+ * read the VE info.
+ *
+ * If a guest kernel action which would normally cause a #VE occurs in
+ * the interrupt-disabled region before TDGETVEINFO, a #DF (double fault)
+ * exception is delivered to the guest, which will result in an oops.
+ *
+ * The entry code has been audited carefully to follow these expectations,
+ * and changes to it have to be audited against them. As with #PF, a #VE
+ * in these places would expose the kernel to privilege escalation or lead
+ * to random crashes.
+ */
+DEFINE_IDTENTRY(exc_virtualization_exception)
+{
+ struct ve_info ve;
+
+ /*
+ * NMIs, machine checks and interrupts are disabled until the
+ * TDGETVEINFO TDCALL is executed. This ensures that the VE
+ * info cannot be overwritten by a nested #VE.
+ */
+ tdx_get_ve_info(&ve);
+
+ cond_local_irq_enable(regs);
+
+ /*
+ * If tdx_handle_virt_exception() cannot process the
+ * exception successfully, treat it as #GP(0) and handle it.
+ */
+ if (!tdx_handle_virt_exception(regs, &ve))
+ ve_raise_fault(regs, 0);
+
+ cond_local_irq_disable(regs);
+}
+
+#endif
+
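
For reference, tdx_get_ve_info() above fills a struct ve_info from the TDGETVEINFO TDCALL, and tdx_handle_virt_exception() acts on it. A sketch of that structure's layout, with the field set assumed from asm/tdx.h at the time of writing (shown for orientation, not added by this patch):

/*
 * Orientation only: assumed layout of the ve_info that tdx_get_ve_info()
 * populates (see arch/x86/include/asm/tdx.h for the real definition).
 */
#include <linux/types.h>

struct ve_info_sketch {
	u64 exit_reason;	/* VM-exit reason that caused the #VE  */
	u64 exit_qual;		/* exit qualification                  */
	u64 gla;		/* guest linear address, if relevant   */
	u64 gpa;		/* guest physical address, if relevant */
	u32 instr_len;		/* length of the faulting instruction  */
	u32 instr_info;		/* instruction information             */
};
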
#ifdef CONFIG_X86_32
DEFINE_IDTENTRY_SW(iret_error)
{