Diffstat (limited to 'arch/x86/entry/entry_64.S')
-rw-r--r--  arch/x86/entry/entry_64.S | 275
1 file changed, 119 insertions(+), 156 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9953d966d124..8af2a26b24f6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -8,7 +8,7 @@
*
* entry.S contains the system-call and fault low-level handling routines.
*
- * Some of this is documented in Documentation/x86/entry_64.rst
+ * Some of this is documented in Documentation/arch/x86/entry_64.rst
*
* A note on terminology:
* - iret frame: Architecture defined interrupt frame from SS to RIP
@@ -18,6 +18,7 @@
* - SYM_FUNC_START/END:Define functions in the symbol table.
* - idtentry: Define exception entry points.
*/
+#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
@@ -34,7 +35,6 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
-#include <asm/export.h>
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
@@ -92,7 +92,7 @@ SYM_CODE_START(entry_SYSCALL_64)
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -126,70 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* In the Xen PV case we must use iret anyway.
*/
- ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
- X86_FEATURE_XENPV
-
- movq RCX(%rsp), %rcx
- movq RIP(%rsp), %r11
-
- cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * If width of "canonical tail" ever becomes variable, this will need
- * to be updated to remain correct on both old and new CPUs.
- *
- * Change top bits to match most significant bit (47th or 56th bit
- * depending on paging mode) in the address.
- */
-#ifdef CONFIG_X86_5LEVEL
- ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
- "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
- shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
- sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
- /* If this changed %rcx, it was not canonical */
- cmpq %rcx, %r11
- jne swapgs_restore_regs_and_return_to_usermode
-
- cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
-
- movq R11(%rsp), %r11
- cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
- * restore RF properly. If the slowpath sets it for whatever reason, we
- * need to restore it correctly.
- *
- * SYSRET can restore TF, but unlike IRET, restoring TF results in a
- * trap from userspace immediately after SYSRET. This would cause an
- * infinite loop whenever #DB happens with register state that satisfies
- * the opportunistic SYSRET conditions. For example, single-stepping
- * this user code:
- *
- * movq $stuck_here, %rcx
- * pushfq
- * popq %r11
- * stuck_here:
- *
- * would never get past 'stuck_here'.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz swapgs_restore_regs_and_return_to_usermode
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
+ ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/*
* We win! This label is here just for ease of understanding
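Note: the deleted block above is the old opportunistic-SYSRET fast path; after this change the asm only tests a flag returned in %al by the C syscall-exit code, and the canonical-RIP/CS/SS/RF/TF checks happen there instead. The canonicality test itself is plain sign-extension arithmetic; a standalone C sketch follows (is_canonical() and vaddr_bits are illustrative names, the 48/57-bit widths come from the deleted ALTERNATIVE):

    /*
     * Sketch of the check the deleted "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx"
     * performed: sign-extend from the top implemented virtual-address bit and
     * verify the value is unchanged.  Relies on arithmetic right shift of a
     * negative value, just as the asm's sar does.
     */
    #include <stdbool.h>
    #include <stdint.h>

    static bool is_canonical(uint64_t addr, unsigned int vaddr_bits)
    {
            unsigned int shift = 64 - vaddr_bits;   /* 16 for 48-bit, 7 for 57-bit */
            int64_t sext = (int64_t)(addr << shift) >> shift;

            return (uint64_t)sext == addr;
    }

With 4-level paging the implemented width is 48 bits (__VIRTUAL_MASK_SHIFT + 1); with LA57 it is 57, matching the two deleted variants.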
@@ -205,7 +143,7 @@ syscall_return_via_sysret:
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
@@ -223,6 +161,7 @@ syscall_return_via_sysret:
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
swapgs
+ CLEAR_CPU_BUFFERS
sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -252,7 +191,7 @@ SYM_FUNC_START(__switch_to_asm)
#ifdef CONFIG_STACKPROTECTOR
movq TASK_stack_canary(%rsi), %rbx
- movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
+ movq %rbx, PER_CPU_VAR(fixed_percpu_data + FIXED_stack_canary)
#endif
/*
@@ -284,34 +223,39 @@ SYM_FUNC_END(__switch_to_asm)
* r12: kernel thread arg
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
- UNWIND_HINT_EMPTY
+SYM_CODE_START(ret_from_fork_asm)
+ /*
+ * This is the start of the kernel stack; even through there's a
+ * register set at the top, the regset isn't necessarily coherent
+ * (consider kthreads) and one cannot unwind further.
+ *
+ * This ensures stack unwinds of kernel threads terminate in a known
+ * good state.
+ */
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // copy_thread
- movq %rax, %rdi
- call schedule_tail /* rdi: 'prev' task parameter */
-
- testq %rbx, %rbx /* from kernel_thread? */
- jnz 1f /* kernel threads are uncommon */
+ CALL_DEPTH_ACCOUNT
-2:
- UNWIND_HINT_REGS
- movq %rsp, %rdi
- call syscall_exit_to_user_mode /* returns with IRQs disabled */
- jmp swapgs_restore_regs_and_return_to_usermode
+ movq %rax, %rdi /* prev */
+ movq %rsp, %rsi /* regs */
+ movq %rbx, %rdx /* fn */
+ movq %r12, %rcx /* fn_arg */
+ call ret_from_fork
-1:
- /* kernel thread */
- UNWIND_HINT_EMPTY
- movq %r12, %rdi
- CALL_NOSPEC rbx
/*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
+ * Set the stack state to what is expected for the target function
+ * -- at this point the register set should be a valid user set
+ * and unwind should work normally.
*/
- movq $0, RAX(%rsp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+ UNWIND_HINT_REGS
+
+#ifdef CONFIG_X86_FRED
+ ALTERNATIVE "jmp swapgs_restore_regs_and_return_to_usermode", \
+ "jmp asm_fred_exit_user", X86_FEATURE_FRED
+#else
+ jmp swapgs_restore_regs_and_return_to_usermode
+#endif
+SYM_CODE_END(ret_from_fork_asm)
.popsection
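For readers following the register moves: %rdi, %rsi, %rdx and %rcx are the first four integer argument registers of the x86-64 SysV calling convention, so ret_from_fork_asm hands prev, regs, fn and fn_arg to a C ret_from_fork(). A sketch of that C side, reconstructed from the asm deleted above (the real function lives in the x86 process code and may differ in detail):

    /*
     * Sketch only.  Argument order mirrors the register setup in
     * ret_from_fork_asm: %rdi = prev, %rsi = regs, %rdx = fn, %rcx = fn_arg.
     */
    void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
                       int (*fn)(void *), void *fn_arg)
    {
            schedule_tail(prev);            /* finish the context switch */

            if (fn) {                       /* kernel thread */
                    fn(fn_arg);
                    /* A kernel thread may return here after kernel_execve();
                     * fall through and exit to the new user mode task. */
                    regs->ax = 0;
            }

            syscall_exit_to_user_mode(regs);
    }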
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
@@ -326,11 +270,12 @@ SYM_CODE_END(ret_from_fork)
#endif
.endm
-SYM_CODE_START_LOCAL(xen_error_entry)
+SYM_CODE_START(xen_error_entry)
+ ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
RET
SYM_CODE_END(xen_error_entry)
@@ -382,7 +327,14 @@ SYM_CODE_END(xen_error_entry)
*/
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+
+ .if \vector == X86_TRAP_BP
+ /* #BP advances %rip to the next instruction */
+ UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0
+ .else
+ UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8
+ .endif
+
ENDBR
ASM_CLAC
cld
@@ -425,14 +377,6 @@ SYM_CODE_END(\asmsym)
idtentry \vector asm_\cfunc \cfunc has_error_code=1
.endm
-/*
- * System vectors which invoke their handlers directly and are not
- * going through the regular common device interrupt handling code.
- */
-.macro idtentry_sysvec vector cfunc
- idtentry \vector asm_\cfunc \cfunc has_error_code=0
-.endm
-
/**
* idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
* @vector: Vector number
@@ -451,7 +395,7 @@ SYM_CODE_END(\asmsym)
*/
.macro idtentry_mce_db vector asmsym cfunc
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS
+ UNWIND_HINT_IRET_ENTRY
ENDBR
ASM_CLAC
cld
@@ -508,7 +452,7 @@ SYM_CODE_END(\asmsym)
*/
.macro idtentry_vc vector asmsym cfunc
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS
+ UNWIND_HINT_IRET_ENTRY
ENDBR
ASM_CLAC
cld
@@ -572,7 +516,7 @@ SYM_CODE_END(\asmsym)
*/
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS offset=8
+ UNWIND_HINT_IRET_ENTRY offset=8
ENDBR
ASM_CLAC
cld
@@ -600,13 +544,13 @@ SYM_CODE_END(\asmsym)
* shared between 32 and 64 bit and emit the __irqentry_text_* markers
* so the stacktrace boundary checks work.
*/
- .align 16
+ __ALIGN
.globl __irqentry_text_start
__irqentry_text_start:
#include <asm/idtentry.h>
- .align 16
+ __ALIGN
.globl __irqentry_text_end
__irqentry_text_end:
ANNOTATE_NOENDBR
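The __irqentry_text_start/__irqentry_text_end markers let stack-trace code decide whether a return address sits inside the IDT entry stubs. A minimal sketch of such a boundary check, assuming the usual linker-symbol-as-array idiom (in_irqentry_text() is an illustrative name, not a specific kernel helper):

    #include <stdbool.h>

    /* Linker-provided section boundaries; declared as arrays so their
     * addresses, not their contents, are what gets compared. */
    extern char __irqentry_text_start[], __irqentry_text_end[];

    static inline bool in_irqentry_text(unsigned long addr)
    {
            return addr >= (unsigned long)__irqentry_text_start &&
                   addr <  (unsigned long)__irqentry_text_end;
    }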
@@ -614,17 +558,28 @@ __irqentry_text_end:
SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
IBRS_EXIT
-#ifdef CONFIG_DEBUG_ENTRY
- /* Assert that pt_regs indicates user mode. */
- testb $3, CS(%rsp)
- jnz 1f
- ud2
-1:
-#endif
#ifdef CONFIG_XEN_PV
ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
#endif
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+ ALTERNATIVE "", "jmp .Lpti_restore_regs_and_return_to_usermode", X86_FEATURE_PTI
+#endif
+
+ STACKLEAK_ERASE
+ POP_REGS
+ add $8, %rsp /* orig_ax */
+ UNWIND_HINT_IRET_REGS
+
+.Lswapgs_and_iret:
+ swapgs
+ CLEAR_CPU_BUFFERS
+ /* Assert that the IRET frame indicates user mode. */
+ testb $3, 8(%rsp)
+ jnz .Lnative_iret
+ ud2
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
+.Lpti_restore_regs_and_return_to_usermode:
POP_REGS pop_rdi=0
/*
@@ -633,7 +588,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
@@ -651,13 +606,14 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
*/
STACKLEAK_ERASE_NOCLOBBER
- SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+ push %rax
+ SWITCH_TO_USER_CR3 scratch_reg=%rdi scratch_reg2=%rax
+ pop %rax
/* Restore RDI. */
popq %rdi
- swapgs
- jmp .Lnative_iret
-
+ jmp .Lswapgs_and_iret
+#endif
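The new assertion at .Lswapgs_and_iret reads the saved CS from the IRET frame (8(%rsp) once orig_ax has been popped) and checks its low two bits, the requestor privilege level. A toy C model of that check (names are illustrative):

    #include <stdint.h>

    static void assert_return_to_user(uint64_t saved_cs)
    {
            /* Low two bits of CS are the RPL: 0 = kernel, 3 = user. */
            if ((saved_cs & 3) == 0)
                    __builtin_trap();       /* stands in for the ud2 */
            /* otherwise take the user path to the native iret (.Lnative_iret) */
    }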
SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
@@ -767,6 +723,8 @@ native_irq_return_ldt:
*/
popq %rax /* Restore user RAX */
+ CLEAR_CPU_BUFFERS
+
/*
* RSP now points to an ordinary IRET frame, except that the page
* is read-only and RSP[31:16] are preloaded with the userspace
@@ -779,7 +737,7 @@ _ASM_NOKPROBE(common_interrupt_return)
/*
* Reload gs selector with exception handling
- * edi: new selector
+ * di: new selector
*
* Is in entry.text as it shouldn't be instrumented.
*/
@@ -828,7 +786,8 @@ EXPORT_SYMBOL(asm_load_gs_index)
*
* C calling convention: exc_xen_hypervisor_callback(struct *pt_regs)
*/
-SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
+ __FUNC_ALIGN
+SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback)
/*
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
@@ -856,8 +815,9 @@ SYM_CODE_END(exc_xen_hypervisor_callback)
* We distinguish between categories by comparing each saved segment register
* with its current contents: any discrepancy means we in category 1.
*/
-SYM_CODE_START(xen_failsafe_callback)
- UNWIND_HINT_EMPTY
+ __FUNC_ALIGN
+SYM_CODE_START_NOALIGN(xen_failsafe_callback)
+ UNWIND_HINT_UNDEFINED
ENDBR
movl %ds, %ecx
cmpw %cx, 0x10(%rsp)
@@ -903,7 +863,8 @@ SYM_CODE_END(xen_failsafe_callback)
* R14 - old CR3
* R15 - old SPEC_CTRL
*/
-SYM_CODE_START_LOCAL(paranoid_entry)
+SYM_CODE_START(paranoid_entry)
+ ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
@@ -972,7 +933,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
* CR3 above, keep the old value in a callee saved register.
*/
IBRS_ENTER save_reg=%r15
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
RET
SYM_CODE_END(paranoid_entry)
@@ -1009,14 +970,14 @@ SYM_CODE_START_LOCAL(paranoid_exit)
IBRS_EXIT save_reg=%r15
/*
- * The order of operations is important. RESTORE_CR3 requires
+ * The order of operations is important. PARANOID_RESTORE_CR3 requires
* kernel GSBASE.
*
* NB to anyone to try to optimize this code: this code does
* not execute at all for exceptions from user mode. Those
- * exceptions go through error_exit instead.
+ * exceptions go through error_return instead.
*/
- RESTORE_CR3 scratch_reg=%rax save_reg=%r14
+ PARANOID_RESTORE_CR3 scratch_reg=%rax save_reg=%r14
/* Handle the three GSBASE cases */
ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
@@ -1038,7 +999,8 @@ SYM_CODE_END(paranoid_exit)
/*
* Switch GS and CR3 if needed.
*/
-SYM_CODE_START_LOCAL(error_entry)
+SYM_CODE_START(error_entry)
+ ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
PUSH_AND_CLEAR_REGS save_ret=1
@@ -1056,14 +1018,11 @@ SYM_CODE_START_LOCAL(error_entry)
/* We have user CR3. Change to kernel CR3. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
IBRS_ENTER
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
-.Lerror_entry_from_usermode_after_swapgs:
-
/* Put us onto the real thread stack. */
- call sync_regs
- RET
+ jmp sync_regs
/*
* There are two places in the kernel that can potentially fault with
@@ -1094,8 +1053,9 @@ SYM_CODE_START_LOCAL(error_entry)
*/
.Lerror_entry_done_lfence:
FENCE_SWAPGS_KERNEL_ENTRY
+ CALL_DEPTH_ACCOUNT
leaq 8(%rsp), %rax /* return pt_regs pointer */
- ANNOTATE_UNRET_END
+ VALIDATE_UNRET_END
RET
.Lbstep_iret:
@@ -1112,7 +1072,7 @@ SYM_CODE_START_LOCAL(error_entry)
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
IBRS_ENTER
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1121,7 +1081,7 @@ SYM_CODE_START_LOCAL(error_entry)
leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
call fixup_bad_iret
mov %rax, %rdi
- jmp .Lerror_entry_from_usermode_after_swapgs
+ jmp sync_regs
SYM_CODE_END(error_entry)
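Replacing "call sync_regs; RET" with a bare "jmp sync_regs" works because sync_regs() is pointer-in/pointer-out: it relocates the register frame to the task's thread stack and returns the new pt_regs pointer, which arrives in %rax just as error_entry's caller expects. A simplified sketch (the real helper is in arch/x86/kernel/traps.c; thread_stack_regs() is an illustrative stand-in):

    struct pt_regs *sync_regs(struct pt_regs *eregs)
    {
            /* Simplified: locate the pt_regs slot at the top of the task's
             * thread stack, copy the entry-stack frame there, return it. */
            struct pt_regs *regs = thread_stack_regs();     /* illustrative */

            if (regs != eregs)
                    *regs = *eregs;
            return regs;
    }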
SYM_CODE_START_LOCAL(error_return)
@@ -1138,10 +1098,10 @@ SYM_CODE_END(error_return)
*
* Registers:
* %r14: Used to save/restore the CR3 of the interrupted context
- * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
+ * when MITIGATION_PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
SYM_CODE_START(asm_exc_nmi)
- UNWIND_HINT_IRET_REGS
+ UNWIND_HINT_IRET_ENTRY
ENDBR
/*
@@ -1154,8 +1114,8 @@ SYM_CODE_START(asm_exc_nmi)
* anyway.
*
* To handle this case we do the following:
- * Check the a special location on the stack that contains
- * a variable that is set when NMIs are executing.
+ * Check a special location on the stack that contains a
+ * variable that is set when NMIs are executing.
* The interrupted task's stack is also checked to see if it
* is an NMI stack.
* If the variable is not set and the stack is not the NMI
@@ -1206,7 +1166,7 @@ SYM_CODE_START(asm_exc_nmi)
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
pushq 5*8(%rdx) /* pt_regs->ss */
pushq 4*8(%rdx) /* pt_regs->rsp */
@@ -1228,7 +1188,6 @@ SYM_CODE_START(asm_exc_nmi)
*/
movq %rsp, %rdi
- movq $-1, %rsi
call exc_nmi
/*
@@ -1286,8 +1245,8 @@ SYM_CODE_START(asm_exc_nmi)
* end_repeat_nmi, then we are a nested NMI. We must not
* modify the "iret" frame because it's being written by
* the outer NMI. That's okay; the outer NMI handler is
- * about to about to call exc_nmi() anyway, so we can just
- * resume the outer NMI.
+ * about to call exc_nmi() anyway, so we can just resume
+ * the outer NMI.
*/
movq $repeat_nmi, %rdx
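This hunk and the comment rewrite earlier describe the nested-NMI case in prose; below is a toy C model of the first-level vs. nested decision. Both flags are really words on the NMI IST stack (e.g. the "NMI executing" slot cleared at 5*8(%rsp) later in this diff), and the replay is done by rewriting the outer NMI's iret frame to point at repeat_nmi; the helper only mirrors the control flow.

    #include <stdbool.h>

    /* Toy model only: mirrors the decision, not the implementation. */
    static bool nmi_executing;      /* models the "NMI executing" stack slot */

    static bool nmi_is_nested(bool stack_is_nmi_stack)
    {
            /*
             * First-level only if the flag is clear AND we did not interrupt
             * code already running on the NMI stack; anything else is nested
             * and must be replayed via repeat_nmi rather than handled now.
             */
            return nmi_executing || stack_is_nmi_stack;
    }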
@@ -1442,14 +1401,12 @@ end_repeat_nmi:
UNWIND_HINT_REGS
movq %rsp, %rdi
- movq $-1, %rsi
call exc_nmi
/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
IBRS_EXIT save_reg=%r15
- /* Always restore stashed CR3 value (see paranoid_entry) */
- RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+ PARANOID_RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
/*
* The above invocation of paranoid_entry stored the GSBASE
@@ -1494,6 +1451,12 @@ nmi_restore:
movq $0, 5*8(%rsp) /* clear "NMI executing" */
/*
+ * Skip CLEAR_CPU_BUFFERS here, since it only helps in rare cases like
+ * NMI in kernel after user state is restored. For an unprivileged user
+ * these conditions are hard to meet.
+ */
+
+ /*
* iretq reads the "iret" frame and exits the NMI stack in a
* single instruction. We are returning to kernel mode, so this
* cannot result in a fault. Similarly, we don't need to worry
@@ -1502,26 +1465,26 @@ nmi_restore:
iretq
SYM_CODE_END(asm_exc_nmi)
-#ifndef CONFIG_IA32_EMULATION
/*
* This handles SYSCALL from 32-bit code. There is no way to program
* MSRs to fully disable 32-bit SYSCALL.
*/
-SYM_CODE_START(ignore_sysret)
- UNWIND_HINT_EMPTY
+SYM_CODE_START(entry_SYSCALL32_ignore)
+ UNWIND_HINT_END_OF_STACK
ENDBR
mov $-ENOSYS, %eax
+ CLEAR_CPU_BUFFERS
sysretl
-SYM_CODE_END(ignore_sysret)
-#endif
+SYM_CODE_END(entry_SYSCALL32_ignore)
.pushsection .text, "ax"
-SYM_CODE_START(rewind_stack_and_make_dead)
+ __FUNC_ALIGN
+SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax
leaq -PTREGS_SIZE(%rax), %rsp
UNWIND_HINT_REGS