From 9e97b73fdb235345a826519862a52a7398c89eb8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:38 +0200 Subject: x86/asm-offsets: Move TSS_sp0 and TSS_sp1 to asm-offsets.c These offsets will be used in 32 bit assembly code as well, so make them available for all of x86 code. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Reviewed-by: Andy Lutomirski Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-2-git-send-email-joro@8bytes.org --- arch/x86/kernel/asm-offsets.c | 4 ++++ arch/x86/kernel/asm-offsets_64.c | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index dcb008c320fe..a1e16286a832 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -103,4 +103,8 @@ void common(void) { OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); + + /* Offset for sp0 and sp1 into the tss_struct */ + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index b2dcd161f514..3b9405e7ba2b 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -65,8 +65,6 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); - OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); - OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); #ifdef CONFIG_STACKPROTECTOR -- cgit v1.2.3-59-g8ed1b From ae2e565bc6aaee3f3db420fec5fdd39755c9813e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:39 +0200 Subject: x86/entry/32: Rename TSS_sysenter_sp0 to TSS_entry2task_stack The stack address doesn't need to be stored in tss.sp0 if the stack is switched manually like on sysenter. Rename the offset so that it still makes sense when its location is changed in later patches. This stack will also be used for all kernel-entry points, not just sysenter. Reflect that and the fact that it is the offset to the task-stack location in the name as well. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-3-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 2 +- arch/x86/kernel/asm-offsets_32.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c371bfee137a..39f711ad0fc9 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -412,7 +412,7 @@ ENTRY(xen_sysenter_target) * 0(%ebp) arg6 */ ENTRY(entry_SYSENTER_32) - movl TSS_sysenter_sp0(%esp), %esp + movl TSS_entry2task_stack(%esp), %esp .Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index a4a3be399f4b..15b3f45b69cd 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -46,8 +46,9 @@ void foo(void) OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); - /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + /* Offset from the entry stack to task stack stored in TSS */ + DEFINE(TSS_entry2task_stack, + offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_STACKPROTECTOR -- cgit v1.2.3-59-g8ed1b From a6b744f3ce9d017dd86b28355de2d8e0d36496d4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:40 +0200 Subject: x86/entry/32: Load task stack from x86_tss.sp1 in SYSENTER handler x86_tss.sp0 will be used to point to the entry stack later to use it as a trampoline stack for other kernel entry points besides SYSENTER. So store the real task stack pointer in x86_tss.sp1, which is otherwise unused by the hardware, as Linux doesn't make use of Ring 1. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-4-git-send-email-joro@8bytes.org --- arch/x86/kernel/asm-offsets_32.c | 9 +++++++-- arch/x86/kernel/process_32.c | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 15b3f45b69cd..82826f2275cc 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -46,9 +46,14 @@ void foo(void) OFFSET(saved_context_gdt_desc, saved_context, gdt_desc); BLANK(); - /* Offset from the entry stack to task stack stored in TSS */ + /* + * Offset from the entry stack to task stack stored in TSS. Kernel entry + * happens on the per-cpu entry-stack, and the asm code switches to the + * task-stack pointer stored in x86_tss.sp1, which is a copy of + * task->thread.sp0 where entry code can find it. 
+ */ DEFINE(TSS_entry2task_stack, - offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + offsetof(struct cpu_entry_area, tss.x86_tss.sp1) - offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0ae659de21eb..ec62cc77388d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -290,6 +290,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); + /* SYSENTER reads the task-stack from tss.sp1 */ + this_cpu_write(cpu_tss_rw.x86_tss.sp1, next_p->thread.sp0); /* * Restore %gs if needed (which is common) -- cgit v1.2.3-59-g8ed1b From 45d7b255747c21fc4b1f5043bee0754d39c3bdbf Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:44 +0200 Subject: x86/entry/32: Enter the kernel via trampoline stack Use the entry-stack as a trampoline to enter the kernel. The entry-stack is already in the cpu_entry_area and will be mapped to userspace when PTI is enabled. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-8-git-send-email-joro@8bytes.org --- arch/x86/entry/entry_32.S | 119 ++++++++++++++++++++++++++++++++------- arch/x86/include/asm/switch_to.h | 14 ++++- arch/x86/kernel/asm-offsets.c | 1 + arch/x86/kernel/cpu/common.c | 5 +- arch/x86/kernel/process.c | 2 - arch/x86/kernel/process_32.c | 2 - 6 files changed, 115 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7251c4f3e99e..fea49ec345ba 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -154,7 +154,7 @@ #endif /* CONFIG_X86_32_LAZY_GS */ -.macro SAVE_ALL pt_regs_ax=%eax +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 cld PUSH_GS pushl %fs @@ -173,6 +173,12 @@ movl $(__KERNEL_PERCPU), %edx movl %edx, %fs SET_KERNEL_GS %edx + + /* Switch to kernel stack if necessary */ +.if \switch_stacks > 0 + SWITCH_TO_KERNEL_STACK +.endif + .endm /* @@ -269,6 +275,73 @@ .Lend_\@: #endif /* CONFIG_X86_ESPFIX32 */ .endm + + +/* + * Called with pt_regs fully populated and kernel segments loaded, + * so we can access PER_CPU and use the integer registers. + * + * We need to be very careful here with the %esp switch, because an NMI + * can happen everywhere. If the NMI handler finds itself on the + * entry-stack, it will overwrite the task-stack and everything we + * copied there. So allocate the stack-frame on the task-stack and + * switch to it before we do any copying. + */ +.macro SWITCH_TO_KERNEL_STACK + + ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV + + /* Are we on the entry stack? Bail out if not! 
*/ + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %esp, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx + jae .Lend_\@ + + /* Load stack pointer into %esi and %edi */ + movl %esp, %esi + movl %esi, %edi + + /* Move %edi to the top of the entry stack */ + andl $(MASK_entry_stack), %edi + addl $(SIZEOF_entry_stack), %edi + + /* Load top of task-stack into %edi */ + movl TSS_entry2task_stack(%edi), %edi + + /* Bytes to copy */ + movl $PTREGS_SIZE, %ecx + +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esi) + jz .Lcopy_pt_regs_\@ + + /* + * Stack-frame contains 4 additional segment registers when + * coming from VM86 mode + */ + addl $(4 * 4), %ecx + +.Lcopy_pt_regs_\@: +#endif + + /* Allocate frame on task-stack */ + subl %ecx, %edi + + /* Switch to task-stack */ + movl %edi, %esp + + /* + * We are now on the task-stack and can safely copy over the + * stack-frame + */ + shrl $2, %ecx + cld + rep movsl + +.Lend_\@: +.endm + /* * %eax: prev task * %edx: next task @@ -469,7 +542,7 @@ ENTRY(entry_SYSENTER_32) pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ pushl %eax /* pt_regs->orig_ax */ - SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, stack already switched */ /* * SYSENTER doesn't filter flags, so we need to clear NT, AC @@ -580,7 +653,8 @@ ENDPROC(entry_SYSENTER_32) ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ - SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + + SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */ /* * User mode is traced as though IRQs are on, and the interrupt gate @@ -677,7 +751,8 @@ END(irq_entries_start) common_interrupt: ASM_CLAC addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ - SAVE_ALL + + SAVE_ALL switch_stacks=1 ENCODE_FRAME_POINTER TRACE_IRQS_OFF movl %esp, %eax @@ -685,16 +760,16 @@ common_interrupt: jmp ret_from_intr ENDPROC(common_interrupt) -#define BUILD_INTERRUPT3(name, nr, fn) \ -ENTRY(name) \ - ASM_CLAC; \ - pushl $~(nr); \ - SAVE_ALL; \ - ENCODE_FRAME_POINTER; \ - TRACE_IRQS_OFF \ - movl %esp, %eax; \ - call fn; \ - jmp ret_from_intr; \ +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + ASM_CLAC; \ + pushl $~(nr); \ + SAVE_ALL switch_stacks=1; \ + ENCODE_FRAME_POINTER; \ + TRACE_IRQS_OFF \ + movl %esp, %eax; \ + call fn; \ + jmp ret_from_intr; \ ENDPROC(name) #define BUILD_INTERRUPT(name, nr) \ @@ -926,16 +1001,20 @@ common_exception: pushl %es pushl %ds pushl %eax + movl $(__USER_DS), %eax + movl %eax, %ds + movl %eax, %es + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs pushl %ebp pushl %edi pushl %esi pushl %edx pushl %ecx pushl %ebx + SWITCH_TO_KERNEL_STACK ENCODE_FRAME_POINTER cld - movl $(__KERNEL_PERCPU), %ecx - movl %ecx, %fs UNWIND_ESPFIX_STACK GS_TO_REG %ecx movl PT_GS(%esp), %edi # get the function address @@ -943,9 +1022,6 @@ common_exception: movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart REG_TO_PTGS %ecx SET_KERNEL_GS %ecx - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer CALL_NOSPEC %edi @@ -964,6 +1040,7 @@ ENTRY(debug) */ ASM_CLAC pushl $-1 # mark this as an int + SAVE_ALL ENCODE_FRAME_POINTER xorl %edx, %edx # error code 0 @@ -999,6 +1076,7 @@ END(debug) */ ENTRY(nmi) ASM_CLAC + #ifdef CONFIG_X86_ESPFIX32 pushl %eax movl %ss, %eax @@ -1066,7 +1144,8 @@ END(nmi) ENTRY(int3) ASM_CLAC pushl $-1 # mark this as an int - SAVE_ALL + + SAVE_ALL switch_stacks=1 
ENCODE_FRAME_POINTER TRACE_IRQS_OFF xorl %edx, %edx # zero error code diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index eb5f7999a893..8bc2f70b452e 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -89,13 +89,23 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) /* This is used when switching tasks or entering/exiting vm86 mode. */ static inline void update_sp0(struct task_struct *task) { - /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ + /* sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 - load_sp0(task->thread.sp0); + if (static_cpu_has(X86_FEATURE_XENPV)) + load_sp0(task->thread.sp0); + else + this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0); #else + /* + * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That + * doesn't work on x86-32 because sp1 and + * cpu_current_top_of_stack have different values (because of + * the non-zero stack-padding on 32bit). + */ if (static_cpu_has(X86_FEATURE_XENPV)) load_sp0(task_top_of_stack(task)); #endif + } #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index a1e16286a832..01de31db300d 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -103,6 +103,7 @@ void common(void) { OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); + DEFINE(MASK_entry_stack, (~(sizeof(struct entry_stack) - 1))); /* Offset for sp0 and sp1 into the tss_struct */ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index eb4cb3efd20e..43a927eb9c09 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1804,11 +1804,12 @@ void cpu_init(void) enter_lazy_tlb(&init_mm, curr); /* - * Initialize the TSS. Don't bother initializing sp0, as the initial - * task never enters user mode. + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. */ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 30ca2d1a9231..c93fcfdf1673 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,14 +57,12 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, -#ifdef CONFIG_X86_64 /* * .sp1 is cpu_current_top_of_stack. The init task never * runs user code, but cpu_current_top_of_stack should still * be well defined before the first context switch. 
*/ .sp1 = TOP_OF_INIT_STACK, -#endif #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ec62cc77388d..0ae659de21eb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -290,8 +290,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); - /* SYSENTER reads the task-stack from tss.sp1 */ - this_cpu_write(cpu_tss_rw.x86_tss.sp1, next_p->thread.sp0); /* * Restore %gs if needed (which is common) -- cgit v1.2.3-59-g8ed1b From 252e1a0526304f0f3f6888fc09e81cb220f957f3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:51 +0200 Subject: x86/entry: Rename update_sp0 to update_task_stack The function does not update sp0 anymore but instead makes the task-stack visible to entry code, either by writing it to sp1 or by doing a hypercall. Rename the function to get rid of the misleading name. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-15-git-send-email-joro@8bytes.org --- arch/x86/include/asm/switch_to.h | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8bc2f70b452e..36bd243843d6 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -87,7 +87,7 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread) #endif /* This is used when switching tasks or entering/exiting vm86 mode. */ -static inline void update_sp0(struct task_struct *task) +static inline void update_task_stack(struct task_struct *task) { /* sp0 always points to the entry trampoline stack, which is constant: */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0ae659de21eb..2924fd447e61 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -285,7 +285,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ - update_sp0(next_p); + update_task_stack(next_p); refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 12bb445fb98d..476e3ddf8890 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -478,7 +478,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); /* Reload sp0. 
*/ - update_sp0(next_p); + update_task_stack(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9d0b5af7db91..1c03e4aa6474 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -149,7 +149,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) preempt_disable(); tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; - update_sp0(tsk); + update_task_stack(tsk); refresh_sysenter_cs(&tsk->thread); vm86->saved_sp0 = 0; preempt_enable(); @@ -374,7 +374,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) refresh_sysenter_cs(&tsk->thread); } - update_sp0(tsk); + update_task_stack(tsk); preempt_enable(); if (vm86->flags & VM86_SCREEN_BITMAP) -- cgit v1.2.3-59-g8ed1b From e3238faf20fb1b51a814497751398ab525a2c884 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:40:54 +0200 Subject: x86/pgtable/32: Allocate 8k page-tables when PTI is enabled Allocate a kernel and a user page-table root when PTI is enabled. Also allocate a full page per root for PAE because otherwise the bit to flip in CR3 to switch between them would be non-constant, which creates a lot of hassle. Keep that for a later optimization. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-18-git-send-email-joro@8bytes.org --- arch/x86/kernel/head_32.S | 20 +++++++++++++++----- arch/x86/mm/pgtable.c | 5 +++-- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index abe6df15a8fb..30f9cb2c0b55 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -512,11 +512,18 @@ ENTRY(initial_code) ENTRY(setup_once_ref) .long setup_once +#ifdef CONFIG_PAGE_TABLE_ISOLATION +#define PGD_ALIGN (2 * PAGE_SIZE) +#define PTI_USER_PGD_FILL 1024 +#else +#define PGD_ALIGN (PAGE_SIZE) +#define PTI_USER_PGD_FILL 0 +#endif /* * BSS section */ __PAGE_ALIGNED_BSS - .align PAGE_SIZE + .align PGD_ALIGN #ifdef CONFIG_X86_PAE .globl initial_pg_pmd initial_pg_pmd: @@ -526,14 +533,17 @@ initial_pg_pmd: initial_page_table: .fill 1024,4,0 #endif + .align PGD_ALIGN initial_pg_fixmap: .fill 1024,4,0 -.globl empty_zero_page -empty_zero_page: - .fill 4096,1,0 .globl swapper_pg_dir + .align PGD_ALIGN swapper_pg_dir: .fill 1024,4,0 + .fill PTI_USER_PGD_FILL,4,0 +.globl empty_zero_page +empty_zero_page: + .fill 4096,1,0 EXPORT_SYMBOL(empty_zero_page) /* @@ -542,7 +552,7 @@ EXPORT_SYMBOL(empty_zero_page) #ifdef CONFIG_X86_PAE __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? 
*/ - .align PAGE_SIZE + .align PGD_ALIGN ENTRY(initial_page_table) .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 47b5951e592b..db6fb7740bf7 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -343,7 +343,8 @@ static inline pgd_t *_pgd_alloc(void) * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) - return (pgd_t *)__get_free_page(PGALLOC_GFP); + return (pgd_t *)__get_free_pages(PGALLOC_GFP, + PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. We can allocate @@ -355,7 +356,7 @@ static inline pgd_t *_pgd_alloc(void) static inline void _pgd_free(pgd_t *pgd) { if (!SHARED_KERNEL_PMD) - free_page((unsigned long)pgd); + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); else kmem_cache_free(pgd_cache, pgd); } -- cgit v1.2.3-59-g8ed1b From 39d668e04edad25abe184fb329ce35a131146ee5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:04 +0200 Subject: x86/mm/pti: Make pti_clone_kernel_text() compile on 32 bit The pti_clone_kernel_text() function references __end_rodata_hpage_align, which is only present on x86-64. This makes sense as the end of the rodata section is not huge-page aligned on 32 bit. Nevertheless a symbol is required for the function that points at the right address for both 32 and 64 bit. Introduce __end_rodata_aligned for that purpose and use it in pti_clone_kernel_text(). Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-28-git-send-email-joro@8bytes.org --- arch/x86/include/asm/sections.h | 1 + arch/x86/kernel/vmlinux.lds.S | 17 ++++++++++------- arch/x86/mm/pti.c | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 5c019d23d06b..4a911a382ade 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -7,6 +7,7 @@ extern char __brk_base[], __brk_limit[]; extern struct exception_table_entry __stop___ex_table[]; +extern char __end_rodata_aligned[]; #if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 5e1458f609a1..8bde0a419f86 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -55,19 +55,22 @@ jiffies_64 = jiffies; * so we can enable protection checks as well as retain 2MB large page * mappings for kernel text. */ -#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X86_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_RODATA_END \ +#define X86_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ - __end_rodata_hpage_align = .; + __end_rodata_hpage_align = .; \ + __end_rodata_aligned = .; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . 
= ALIGN(PMD_SIZE); #else -#define X64_ALIGN_RODATA_BEGIN -#define X64_ALIGN_RODATA_END +#define X86_ALIGN_RODATA_BEGIN +#define X86_ALIGN_RODATA_END \ + . = ALIGN(PAGE_SIZE); \ + __end_rodata_aligned = .; #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END @@ -141,9 +144,9 @@ SECTIONS /* .text should occupy whole number of pages */ . = ALIGN(PAGE_SIZE); - X64_ALIGN_RODATA_BEGIN + X86_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_RODATA_END + X86_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index a594e3b6401a..453d23760941 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -470,7 +470,7 @@ void pti_clone_kernel_text(void) * clone the areas past rodata, they might contain secrets. */ unsigned long start = PFN_ALIGN(_text); - unsigned long end = (unsigned long)__end_rodata_hpage_align; + unsigned long end = (unsigned long)__end_rodata_aligned; if (!pti_kernel_image_global_ok()) return; -- cgit v1.2.3-59-g8ed1b From 8195d869d118bc30bf0be8d0c5d8849d6f58529b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:11 +0200 Subject: x86/ldt: Define LDT_END_ADDR It marks the end of the address-space range reserved for the LDT. The LDT-code will use it when unmapping the LDT for user-space. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-35-git-send-email-joro@8bytes.org --- arch/x86/include/asm/pgtable_32_types.h | 2 ++ arch/x86/include/asm/pgtable_64_types.h | 1 + arch/x86/kernel/ldt.c | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index 7297810747cf..b0bc0fff5f1f 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -53,6 +53,8 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ #define LDT_BASE_ADDR \ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) +#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE) + #define PKMAP_BASE \ ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 066e0ab9ffab..04edd2d58211 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -115,6 +115,7 @@ extern unsigned int ptrs_per_p4d; #define LDT_PGD_ENTRY_L5 -112UL #define LDT_PGD_ENTRY (pgtable_l5_enabled() ? 
LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) #define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) +#define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE) #define __VMALLOC_BASE_L4 0xffffc90000000000UL #define __VMALLOC_BASE_L5 0xffa0000000000000UL diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index c9b14020f4dd..e921b3d494d5 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -206,7 +206,7 @@ static void free_ldt_pgtables(struct mm_struct *mm) #ifdef CONFIG_PAGE_TABLE_ISOLATION struct mmu_gather tlb; unsigned long start = LDT_BASE_ADDR; - unsigned long end = start + (1UL << PGDIR_SHIFT); + unsigned long end = LDT_END_ADDR; if (!static_cpu_has(X86_FEATURE_PTI)) return; -- cgit v1.2.3-59-g8ed1b From 9bae3197e15dd5e03ce8e237db6fe4486b08a775 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:12 +0200 Subject: x86/ldt: Split out sanity check in map_ldt_struct() This splits out the mapping sanity check and the actual mapping of the LDT to user-space from the map_ldt_struct() function in a way so that it is re-usable for PAE paging. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-36-git-send-email-joro@8bytes.org --- arch/x86/kernel/ldt.c | 82 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index e921b3d494d5..69af9a0d57b7 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -100,6 +100,49 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return new_ldt; } +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +static void do_sanity_check(struct mm_struct *mm, + bool had_kernel_mapping, + bool had_user_mapping) +{ + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_kernel_mapping); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!had_user_mapping); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. + */ + WARN_ON(had_kernel_mapping); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(had_user_mapping); + } +} + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + + if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pgd(kernel_to_user_pgdp(pgd), *pgd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); + bool had_kernel = (pgd->pgd != 0); + bool had_user = (kernel_to_user_pgdp(pgd)->pgd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + /* * If PTI is enabled, this maps the LDT into the kernelmode and * usermode tables for the given mm. 
@@ -115,9 +158,8 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION - bool is_vmalloc, had_top_level_entry; unsigned long va; + bool is_vmalloc; spinlock_t *ptl; pgd_t *pgd; int i; @@ -131,13 +173,15 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) */ WARN_ON(ldt->slot != -1); + /* Check if the current mappings are sane */ + sanity_check_ldt_mapping(mm); + /* * Did we already have the top level entry allocated? We can't * use pgd_none() for this because it doens't do anything on * 4-level page table kernels. */ pgd = pgd_offset(mm, LDT_BASE_ADDR); - had_top_level_entry = (pgd->pgd != 0); is_vmalloc = is_vmalloc_addr(ldt->entries); @@ -172,35 +216,25 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) pte_unmap_unlock(ptep, ptl); } - if (mm->context.ldt) { - /* - * We already had an LDT. The top-level entry should already - * have been allocated and synchronized with the usermode - * tables. - */ - WARN_ON(!had_top_level_entry); - if (static_cpu_has(X86_FEATURE_PTI)) - WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); - } else { - /* - * This is the first time we're mapping an LDT for this process. - * Sync the pgd to the usermode tables. - */ - WARN_ON(had_top_level_entry); - if (static_cpu_has(X86_FEATURE_PTI)) { - WARN_ON(kernel_to_user_pgdp(pgd)->pgd); - set_pgd(kernel_to_user_pgdp(pgd), *pgd); - } - } + /* Propagate LDT mapping to the user page-table */ + map_ldt_struct_to_user(mm); va = (unsigned long)ldt_slot_va(slot); flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); ldt->slot = slot; -#endif return 0; } +#else /* !CONFIG_PAGE_TABLE_ISOLATION */ + +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ + return 0; +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + static void free_ldt_pgtables(struct mm_struct *mm) { #ifdef CONFIG_PAGE_TABLE_ISOLATION -- cgit v1.2.3-59-g8ed1b From 6df934b92a549cb3badb6d576f71aeb133e2f110 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 18 Jul 2018 11:41:13 +0200 Subject: x86/ldt: Enable LDT user-mapping for PAE This adds the needed special case for PAE to get the LDT mapped into the user page-table when PTI is enabled. The big difference to the other paging modes is that on PAE there is no full top-level PGD entry available for the LDT, but only a PMD entry. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: Pavel Machek Cc: "H . Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: "David H . 
Gutteridge" Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1531906876-13451-37-git-send-email-joro@8bytes.org --- arch/x86/include/asm/mmu_context.h | 5 ---- arch/x86/kernel/ldt.c | 53 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index bbc796eb0a3b..eeeb9289c764 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -71,12 +71,7 @@ struct ldt_struct { static inline void *ldt_slot_va(int slot) { -#ifdef CONFIG_X86_64 return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot); -#else - BUG(); - return (void *)fix_to_virt(FIX_HOLE); -#endif } /* diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 69af9a0d57b7..733e6ace0fa4 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -126,6 +126,57 @@ static void do_sanity_check(struct mm_struct *mm, } } +#ifdef CONFIG_X86_PAE + +static pmd_t *pgd_to_pmd_walk(pgd_t *pgd, unsigned long va) +{ + p4d_t *p4d; + pud_t *pud; + + if (pgd->pgd == 0) + return NULL; + + p4d = p4d_offset(pgd, va); + if (p4d_none(*p4d)) + return NULL; + + pud = pud_offset(p4d, va); + if (pud_none(*pud)) + return NULL; + + return pmd_offset(pud, va); +} + +static void map_ldt_struct_to_user(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + + if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + set_pmd(u_pmd, *k_pmd); +} + +static void sanity_check_ldt_mapping(struct mm_struct *mm) +{ + pgd_t *k_pgd = pgd_offset(mm, LDT_BASE_ADDR); + pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); + bool had_kernel, had_user; + pmd_t *k_pmd, *u_pmd; + + k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); + u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); + had_kernel = (k_pmd->pmd != 0); + had_user = (u_pmd->pmd != 0); + + do_sanity_check(mm, had_kernel, had_user); +} + +#else /* !CONFIG_X86_PAE */ + static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); @@ -143,6 +194,8 @@ static void sanity_check_ldt_mapping(struct mm_struct *mm) do_sanity_check(mm, had_kernel, had_user); } +#endif /* CONFIG_X86_PAE */ + /* * If PTI is enabled, this maps the LDT into the kernelmode and * usermode tables for the given mm. -- cgit v1.2.3-59-g8ed1b From ca38dc8f2724d101038b1205122c93a1c7f38f11 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Jul 2018 17:48:03 +0200 Subject: x86/kexec: Allocate 8k PGDs for PTI Fuzzing the PTI-x86-32 code with trinity showed unhandled kernel paging request oops-messages that looked a lot like silent data corruption. Lots of debugging and testing led to the kexec-32bit code, which is still allocating 4k PGDs when PTI is enabled. But since it uses native_set_pud() to build the page-table, it will inevitably call into __pti_set_user_pgtbl(), which writes beyond the allocated 4k page. Use PGD_ALLOCATION_ORDER to allocate PGDs in the kexec code to fix the issue. Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Tested-by: David H. Gutteridge Cc: "H . 
Peter Anvin" Cc: linux-mm@kvack.org Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Jiri Kosina Cc: Boris Ostrovsky Cc: Brian Gerst Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: Andrea Arcangeli Cc: Waiman Long Cc: Pavel Machek Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: joro@8bytes.org Link: https://lkml.kernel.org/r/1532533683-5988-4-git-send-email-joro@8bytes.org --- arch/x86/kernel/machine_kexec_32.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d1ab07ec8c9a..5409c2800ab5 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -56,7 +56,7 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { - free_page((unsigned long)image->arch.pgd); + free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); @@ -72,7 +72,8 @@ static void machine_kexec_free_page_tables(struct kimage *image) static int machine_kexec_alloc_page_tables(struct kimage *image) { - image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); + image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + PGD_ALLOCATION_ORDER); #ifdef CONFIG_X86_PAE image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); -- cgit v1.2.3-59-g8ed1b From fdf82a7856b32d905c39afc85e34364491e46346 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 26 Jul 2018 13:14:55 +0200 Subject: x86/speculation: Protect against userspace-userspace spectreRSB The article "Spectre Returns! Speculation Attacks using the Return Stack Buffer" [1] describes two new (sub-)variants of spectrev2-like attacks, making use solely of the RSB contents even on CPUs that don't fallback to BTB on RSB underflow (Skylake+). Mitigate userspace-userspace attacks by always unconditionally filling RSB on context switch when the generic spectrev2 mitigation has been enabled. 
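For illustration, "filling the RSB" amounts to executing a run of benign CALLs so that the return stack buffer no longer holds return targets trained by the previously running task. The kernel implements this with the FILL_RETURN_BUFFER macro in arch/x86/include/asm/nospec-branch.h, emitted on the context-switch path when X86_FEATURE_RSB_CTXSW is set; the GNU-as sketch below only illustrates the idea, and the macro name and local label are invented for this note rather than taken from the kernel sources.

.macro STUFF_RSB_SKETCH nr_entries=32
        .rept \nr_entries
        call    772f                    /* each CALL pushes one benign entry into the RSB */
        pause                           /* speculation trap: a mispredicted RET that      */
        lfence                          /* consumes the entry stalls here harmlessly      */
772:
        .endr
        add     $(\nr_entries * 4), %esp /* reclaim the real return addresses (4 bytes each on 32-bit) */
.endm

The pause/lfence pair is the important part: a later RET that mispredicts through one of these RSB entries lands on a fence instead of on attacker-trained code.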
[1] https://arxiv.org/pdf/1807.07940.pdf Signed-off-by: Jiri Kosina Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Acked-by: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Borislav Petkov Cc: David Woodhouse Cc: Peter Zijlstra Cc: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/nycvar.YFH.7.76.1807261308190.997@cbobk.fhfr.pm --- arch/x86/kernel/cpu/bugs.c | 38 +++++++------------------------------- 1 file changed, 7 insertions(+), 31 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 5c0ea39311fe..bc8c43b22460 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -313,23 +313,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return cmd; } -/* Check for Skylake-like CPUs (for RSB handling) */ -static bool __init is_skylake_era(void) -{ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_SKYLAKE_X: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: - return true; - } - } - return false; -} - static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -390,22 +373,15 @@ retpoline_auto: pr_info("%s\n", spectre_v2_strings[mode]); /* - * If neither SMEP nor PTI are available, there is a risk of - * hitting userspace addresses in the RSB after a context switch - * from a shallow call stack to a deeper one. To prevent this fill - * the entire RSB, even when using IBRS. + * If spectre v2 protection has been enabled, unconditionally fill + * RSB during a context switch; this protects against two independent + * issues: * - * Skylake era CPUs have a separate issue with *underflow* of the - * RSB, when they will predict 'ret' targets from the generic BTB. - * The proper mitigation for this is IBRS. If IBRS is not supported - * or deactivated in favour of retpolines the RSB fill on context - * switch is required. + * - RSB underflow (and switch to BTB) on Skylake+ + * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs */ - if ((!boot_cpu_has(X86_FEATURE_PTI) && - !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 mitigation: Filling RSB on context switch\n"); - } + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); /* Initialize Indirect Branch Prediction Barrier if supported */ if (boot_cpu_has(X86_FEATURE_IBPB)) { -- cgit v1.2.3-59-g8ed1b From 706d51681d636a0c4a5ef53395ec3b803e45ed4d Mon Sep 17 00:00:00 2001 From: Sai Praneeth Date: Wed, 1 Aug 2018 11:42:25 -0700 Subject: x86/speculation: Support Enhanced IBRS on future CPUs Future Intel processors will support "Enhanced IBRS" which is an "always on" mode i.e. IBRS bit in SPEC_CTRL MSR is enabled once and never disabled. From the specification [1]: "With enhanced IBRS, the predicted targets of indirect branches executed cannot be controlled by software that was executed in a less privileged predictor mode or on another logical processor. As a result, software operating on a processor with enhanced IBRS need not use WRMSR to set IA32_SPEC_CTRL.IBRS after every transition to a more privileged predictor mode. Software can isolate predictor modes effectively simply by setting the bit once. 
Software need not disable enhanced IBRS prior to entering a sleep state such as MWAIT or HLT." If Enhanced IBRS is supported by the processor then use it as the preferred spectre v2 mitigation mechanism instead of Retpoline. Intel's Retpoline white paper [2] states: "Retpoline is known to be an effective branch target injection (Spectre variant 2) mitigation on Intel processors belonging to family 6 (enumerated by the CPUID instruction) that do not have support for enhanced IBRS. On processors that support enhanced IBRS, it should be used for mitigation instead of retpoline." The reason why Enhanced IBRS is the recommended mitigation on processors which support it is that these processors also support CET which provides a defense against ROP attacks. Retpoline is very similar to ROP techniques and might trigger false positives in the CET defense. If Enhanced IBRS is selected as the mitigation technique for spectre v2, the IBRS bit in SPEC_CTRL MSR is set once at boot time and never cleared. Kernel also has to make sure that IBRS bit remains set after VMEXIT because the guest might have cleared the bit. This is already covered by the existing x86_spec_ctrl_set_guest() and x86_spec_ctrl_restore_host() speculation control functions. Enhanced IBRS still requires IBPB for full mitigation. [1] Speculative-Execution-Side-Channel-Mitigations.pdf [2] Retpoline-A-Branch-Target-Injection-Mitigation.pdf Both documents are available at: https://bugzilla.kernel.org/show_bug.cgi?id=199511 Originally-by: David Woodhouse Signed-off-by: Sai Praneeth Prakhya Signed-off-by: Thomas Gleixner Cc: Tim C Chen Cc: Dave Hansen Cc: Ravi Shankar Link: https://lkml.kernel.org/r/1533148945-24095-1-git-send-email-sai.praneeth.prakhya@intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/nospec-branch.h | 1 + arch/x86/kernel/cpu/bugs.c | 20 ++++++++++++++++++-- arch/x86/kernel/cpu/common.c | 3 +++ 4 files changed, 23 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5701f5cecd31..2687cd8e8d58 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -219,6 +219,7 @@ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+29) /* Enhanced IBRS */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index c99082e2ef13..fd2a8c1b88bc 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -214,6 +214,7 @@ enum spectre_v2_mitigation { SPECTRE_V2_RETPOLINE_MINIMAL_AMD, SPECTRE_V2_RETPOLINE_GENERIC, SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_IBRS_ENHANCED, }; /* The Speculative Store Bypass disable variants */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bc8c43b22460..405a9a61bb89 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -130,6 +130,7 @@ static const char *spectre_v2_strings[] = { [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", + 
[SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", }; #undef pr_fmt @@ -332,6 +333,13 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + mode = SPECTRE_V2_IBRS_ENHANCED; + /* Force it so VMEXIT will restore correctly */ + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + goto specv2_set_mode; + } if (IS_ENABLED(CONFIG_RETPOLINE)) goto retpoline_auto; break; @@ -369,6 +377,7 @@ retpoline_auto: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } +specv2_set_mode: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -391,9 +400,16 @@ retpoline_auto: /* * Retpoline means the kernel is safe because it has no indirect - * branches. But firmware isn't, so use IBRS to protect that. + * branches. Enhanced IBRS protects firmware too, so, enable restricted + * speculation around firmware calls only when Enhanced IBRS isn't + * supported. + * + * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because + * the user might select retpoline on the kernel command line and if + * the CPU supports Enhanced IBRS, kernel might un-intentionally not + * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS)) { + if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 43a927eb9c09..df28e931d732 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1005,6 +1005,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + if (ia32_cap & ARCH_CAP_IBRS_ALL) + setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (x86_match_cpu(cpu_no_meltdown)) return; -- cgit v1.2.3-59-g8ed1b
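A closing illustration of the "always on" model described in the Enhanced IBRS changelog above: the kernel sets the IBRS bit in the IA32_SPEC_CTRL MSR once during boot and never clears it again, which the patch does in C via x86_spec_ctrl_base and wrmsrl() in spectre_v2_select_mitigation(). The GNU-as sketch below shows the equivalent one-time MSR write; it is illustrative only and not code from the series (MSR index 0x48 and bit 0 are the architectural values for IA32_SPEC_CTRL and IBRS).

        movl    $0x00000048, %ecx       /* MSR_IA32_SPEC_CTRL */
        rdmsr                           /* current value in EDX:EAX */
        orl     $0x00000001, %eax       /* SPEC_CTRL_IBRS (bit 0) */
        wrmsr                           /* written once; with enhanced IBRS it is never cleared */

Because the bit is expected to stay set, the only remaining concern is the VMEXIT path, where a guest may have cleared it; as the changelog notes, the existing x86_spec_ctrl_restore_host() handling already restores it there.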