aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/x86/entry/common.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry/common.c')
-rw-r--r--arch/x86/entry/common.c208
1 files changed, 173 insertions, 35 deletions
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 6c2826417b33..6356060caaf3 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -19,12 +19,14 @@
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
+#include <linux/init.h>
#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
#include <xen/events.h>
#endif
+#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
@@ -70,7 +72,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
return false;
}
-__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
@@ -84,6 +87,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
instrumentation_end();
syscall_exit_to_user_mode(regs);
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower but fully capable IRET
+ * exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * TASK_SIZE_MAX covers all user-accessible addresses other than
+ * the deprecated vsyscall page.
+ */
+ if (unlikely(regs->ip >= TASK_SIZE_MAX))
+ return false;
+
+ /*
+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
}
#endif
@@ -96,6 +139,16 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs)
return (int)regs->orig_ax;
}
+#ifdef CONFIG_IA32_EMULATION
+bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
+
+static int ia32_emulation_override_cmdline(char *arg)
+{
+ return kstrtobool(arg, &__ia32_enabled);
+}
+early_param("ia32_emulation", ia32_emulation_override_cmdline);
+#endif
+
/*
* Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
*/
@@ -115,7 +168,96 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
}
}
-/* Handles int $0x80 */
+#ifdef CONFIG_IA32_EMULATION
+static __always_inline bool int80_is_external(void)
+{
+ const unsigned int offs = (0x80 / 32) * 0x10;
+ const u32 bit = BIT(0x80 % 32);
+
+ /* The local APIC on XENPV guests is fake */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /*
+ * If vector 0x80 is set in the APIC ISR then this is an external
+ * interrupt. Either from broken hardware or injected by a VMM.
+ *
+ * Note: In guest mode this is only valid for secure guests where
+ * the secure module fully controls the vAPIC exposed to the guest.
+ */
+ return apic_read(APIC_ISR + offs) & bit;
+}
+
+/**
+ * int80_emulation - 32-bit legacy syscall entry
+ *
+ * This entry point can be used by 32-bit and 64-bit programs to perform
+ * 32-bit system calls. Instances of INT $0x80 can be found inline in
+ * various programs and libraries. It is also used by the vDSO's
+ * __kernel_vsyscall fallback for hardware that doesn't support a faster
+ * entry method. Restarted 32-bit system calls also fall back to INT
+ * $0x80 regardless of what instruction was originally used to do the
+ * system call.
+ *
+ * This is considered a slow path. It is not used by most libc
+ * implementations on modern hardware except during process startup.
+ *
+ * The arguments for the INT $0x80 based syscall are on stack in the
+ * pt_regs structure:
+ * eax: system call number
+ * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6
+ */
+DEFINE_IDTENTRY_RAW(int80_emulation)
+{
+ int nr;
+
+ /* Kernel does not use INT $0x80! */
+ if (unlikely(!user_mode(regs))) {
+ irqentry_enter(regs);
+ instrumentation_begin();
+ panic("Unexpected external interrupt 0x80\n");
+ }
+
+ /*
+ * Establish kernel context for instrumentation, including for
+ * int80_is_external() below which calls into the APIC driver.
+ * Identical for soft and external interrupts.
+ */
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ add_random_kstack_offset();
+
+ /* Validate that this is a soft interrupt to the extent possible */
+ if (unlikely(int80_is_external()))
+ panic("Unexpected external interrupt 0x80\n");
+
+ /*
+ * The low level idtentry code pushed -1 into regs::orig_ax
+ * and regs::ax contains the syscall number.
+ *
+ * User tracing code (ptrace or signal handlers) might assume
+ * that the regs::orig_ax contains a 32-bit number on invoking
+ * a 32-bit syscall.
+ *
+ * Establish the syscall convention by saving the 32bit truncated
+ * syscall number in regs::orig_ax and by invalidating regs::ax.
+ */
+ regs->orig_ax = regs->ax & GENMASK(31, 0);
+ regs->ax = -ENOSYS;
+
+ nr = syscall_32_enter(regs);
+
+ local_irq_enable();
+ nr = syscall_enter_from_user_mode_work(regs, nr);
+ do_syscall_32_irqs_on(regs, nr);
+
+ instrumentation_end();
+ syscall_exit_to_user_mode(regs);
+}
+#else /* CONFIG_IA32_EMULATION */
+
+/* Handles int $0x80 on a 32bit kernel */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
int nr = syscall_32_enter(regs);
@@ -134,6 +276,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
instrumentation_end();
syscall_exit_to_user_mode(regs);
}
+#endif /* !CONFIG_IA32_EMULATION */
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
@@ -182,8 +325,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
return true;
}
-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
{
/*
* Called using the internal vDSO SYSENTER/SYSCALL32 calling
@@ -201,41 +344,36 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
/* Invoke the syscall. If it failed, keep it simple: use IRET. */
if (!__do_fast_syscall_32(regs))
- return 0;
+ return false;
-#ifdef CONFIG_X86_64
- /*
- * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
- * SYSRETL is available on all 64-bit CPUs, so we don't need to
- * bother with SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
- */
- return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
-#else
/*
- * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
- *
- * We don't allow syscalls at all from VM86 mode, but we still
- * need to check VM, because we might be returning from sys_vm86.
+ * Check that the register state is valid for using SYSRETL/SYSEXIT
+ * to exit to userspace. Otherwise use the slower but fully capable
+ * IRET exit path.
*/
- return static_cpu_has(X86_FEATURE_SEP) &&
- regs->cs == __USER_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
-#endif
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* EIP must point to the VDSO landing pad */
+ if (unlikely(regs->ip != landing_pad))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+ return false;
+
+ /* If the TF, RF, or VM flags are set, use IRET */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+ return false;
+
+ /* Use SYSRETL/SYSEXIT to exit to userspace */
+ return true;
}
-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
+/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
+__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
{
/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
regs->sp = regs->bp;
@@ -294,7 +432,7 @@ static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
inc_irq_stat(irq_hv_callback_count);
- xen_hvm_evtchn_do_upcall();
+ xen_evtchn_do_upcall();
set_irq_regs(old_regs);
}