From c7ca0b614513afba57824cae68447f9c32b1ee61 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 15 Jul 2019 07:21:44 -0700 Subject: Revert "x86/ptrace: Prevent ptrace from clearing the FS/GS selector" and fix the test This reverts commit 48f5e52e916b55fb73754833efbacc7f8081a159. The ptrace ABI change was a prerequisite to the proposed design for FSGSBASE. Since FSGSBASE support has been reverted, and since I'm not convinced that the ABI was ever adequately tested, revert the ABI change as well. This also modifies the test case so that it tests the preexisting behavior. Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/fca39c478ea7fb15bc76fe8a36bd180810a067f6.1563200250.git.luto@kernel.org --- arch/x86/kernel/ptrace.c | 14 ++++++++++++-- tools/testing/selftests/x86/fsgsbase.c | 22 ++++------------------ 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 71691a8310e7..0fdbe89d0754 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -369,12 +369,22 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - x86_fsbase_write_task(child, value); + /* + * When changing the FS base, use do_arch_prctl_64() + * to set the index to zero and to set the base + * as requested. + */ + if (child->thread.fsbase != value) + return do_arch_prctl_64(child, ARCH_SET_FS, value); return 0; case offsetof(struct user_regs_struct,gs_base): + /* + * Exactly the same here as the %fs handling above. 
+ */ if (value >= TASK_SIZE_MAX) return -EIO; - x86_gsbase_write_task(child, value); + if (child->thread.gsbase != value) + return do_arch_prctl_64(child, ARCH_SET_GS, value); return 0; #endif } diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 5ab4c60c100e..15a329da59fa 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -489,25 +489,11 @@ static void test_ptrace_write_gsbase(void) * selector value is changed or not by the GSBASE write in * a ptracer. */ - if (gs != *shared_scratch) { - nerrs++; - printf("[FAIL]\tGS changed to %lx\n", gs); - - /* - * On older kernels, poking a nonzero value into the - * base would zero the selector. On newer kernels, - * this behavior has changed -- poking the base - * changes only the base and, if FSGSBASE is not - * available, this may have no effect. - */ - if (gs == 0) - printf("\tNote: this is expected behavior on older kernels.\n"); - } else if (have_fsgsbase && (base != 0xFF)) { - nerrs++; - printf("[FAIL]\tGSBASE changed to %lx\n", base); + if (gs == 0 && base == 0xFF) { + printf("[OK]\tGS was reset as expected\n"); } else { - printf("[OK]\tGS remained 0x%hx%s", *shared_scratch, have_fsgsbase ? " and GSBASE changed to 0xFF" : ""); - printf("\n"); + nerrs++; + printf("[FAIL]\tGS=0x%lx, GSBASE=0x%lx (should be 0, 0xFF)\n", gs, base); } } -- cgit v1.2.3-59-g8ed1b From e74bd96989dd42a51a73eddb4a5510a6f5e42ac3 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 9 Jul 2019 19:44:03 -0700 Subject: x86/boot: Fix memory leak in default_get_smp_config() When default_get_smp_config() is called with early == 1 and mpf->feature1 is non-zero, mpf is leaked because the return path does not do early_memunmap(). Fix this and share a common exit routine. 
Fixes: 5997efb96756 ("x86/boot: Use memremap() to map the MPF and MPC data") Reported-by: Cfir Cohen Signed-off-by: David Rientjes Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907091942570.28240@chino.kir.corp.google.com --- arch/x86/kernel/mpparse.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 1bfe5c6e6cfe..afac7ccce72f 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -546,17 +546,15 @@ void __init default_get_smp_config(unsigned int early) * local APIC has default address */ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - return; + goto out; } pr_info("Default MP configuration #%d\n", mpf->feature1); construct_default_ISA_mptable(mpf->feature1); } else if (mpf->physptr) { - if (check_physptr(mpf, early)) { - early_memunmap(mpf, sizeof(*mpf)); - return; - } + if (check_physptr(mpf, early)) + goto out; } else BUG(); @@ -565,7 +563,7 @@ void __init default_get_smp_config(unsigned int early) /* * Only use the first configuration found. */ - +out: early_memunmap(mpf, sizeof(*mpf)); } -- cgit v1.2.3-59-g8ed1b From ffdb07f31252625b7bcbf1f424d7beccff02ba97 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 10 Jul 2019 13:19:35 -0700 Subject: x86/mm: Free sme_early_buffer after init The contents of sme_early_buffer should be cleared after __sme_early_enc_dec() because it is used to move encrypted and decrypted data, but since __sme_early_enc_dec() is __init this buffer simply can be freed after init. This saves a page that is otherwise unreferenced after init. 
Reported-by: Cfir Cohen Signed-off-by: David Rientjes Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907101318170.197432@chino.kir.corp.google.com --- arch/x86/mm/mem_encrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index e0df96fdfe46..e94e0a62ba92 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(sev_enable_key); bool sev_enabled __section(.data); /* Buffer used for early in-place encryption by BSP, no locking needed */ -static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); +static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); /* * This routine does not change the underlying encryption setting of the -- cgit v1.2.3-59-g8ed1b From ec6335586953b0df32f83ef696002063090c7aef Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 8 Jul 2019 17:36:45 -0400 Subject: x86/apic: Silence -Wtype-limits compiler warnings There are many compiler warnings like this, In file included from ./arch/x86/include/asm/smp.h:13, from ./arch/x86/include/asm/mmzone_64.h:11, from ./arch/x86/include/asm/mmzone.h:5, from ./include/linux/mmzone.h:969, from ./include/linux/gfp.h:6, from ./include/linux/mm.h:10, from arch/x86/kernel/apic/io_apic.c:34: arch/x86/kernel/apic/io_apic.c: In function 'check_timer': ./arch/x86/include/asm/apic.h:37:11: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits] if ((v) <= apic_verbosity) \ ^~ arch/x86/kernel/apic/io_apic.c:2160:2: note: in expansion of macro 'apic_printk' apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " ^~~~~~~~~~~ ./arch/x86/include/asm/apic.h:37:11: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits] if ((v) <= apic_verbosity) \ ^~ arch/x86/kernel/apic/io_apic.c:2207:4: note: in expansion of macro 'apic_printk' apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " ^~~~~~~~~~~ APIC_QUIET is 
0, so silence them by making apic_verbosity type int. Signed-off-by: Qian Cai Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1562621805-24789-1-git-send-email-cai@lca.pw --- arch/x86/include/asm/apic.h | 2 +- arch/x86/kernel/apic/apic.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 050e5f9ebf81..e647aa095867 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -49,7 +49,7 @@ static inline void generic_apic_probe(void) #ifdef CONFIG_X86_LOCAL_APIC -extern unsigned int apic_verbosity; +extern int apic_verbosity; extern int local_apic_timer_c2_ok; extern int disable_apic; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1bd91cb7b320..f5291362da1a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); /* * Debug level, exported for io_apic.c */ -unsigned int apic_verbosity; +int apic_verbosity; int pic_mode; -- cgit v1.2.3-59-g8ed1b From f709f81483d652b4ae5bbda2204b95593ce07c8f Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 15 Jul 2019 10:47:09 +0800 Subject: x86/e820: Use proper booleans instead of 0/1 This fixes the following coccinelle warning: ./arch/x86/kernel/e820.c:89:9-10: WARNING: return of 0/1 in function '_e820__mapped_any' with return type bool Return type bool instead of 0/1. 
Signed-off-by: Yi Wang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1563158829-44373-1-git-send-email-wang.yi59@zte.com.cn --- arch/x86/kernel/e820.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e69408bf664b..7da2bcd2b8eb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -86,9 +86,9 @@ static bool _e820__mapped_any(struct e820_table *table, continue; if (entry->addr >= end || entry->addr + entry->size <= start) continue; - return 1; + return true; } - return 0; + return false; } bool e820__mapped_raw_any(u64 start, u64 end, enum e820_type type) -- cgit v1.2.3-59-g8ed1b From 29e7e9664aec17b94a9c8c5a75f8d216a206aa3a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 12 Jul 2019 11:08:05 +0200 Subject: x86: math-emu: Hide clang warnings for 16-bit overflow clang warns about a few parts of the math-emu implementation where a 16-bit integer becomes negative during assignment: arch/x86/math-emu/poly_tan.c:88:35: error: implicit conversion from 'int' to 'short' changes value from 49216 to -16320 [-Werror,-Wconstant-conversion] (0x41 + EXTENDED_Ebias) | SIGN_Negative); ~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~ arch/x86/math-emu/fpu_emu.h:180:58: note: expanded from macro 'setexponent16' #define setexponent16(x,y) { (*(short *)&((x)->exp)) = (y); } ~ ^ arch/x86/math-emu/reg_constant.c:37:32: error: implicit conversion from 'int' to 'short' changes value from 49085 to -16451 [-Werror,-Wconstant-conversion] FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66, ^~~~~~~~~~~~~~~~~~ arch/x86/math-emu/reg_constant.c:21:25: note: expanded from macro 'MAKE_REG' ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~ arch/x86/math-emu/reg_constant.c:48:28: error: implicit conversion from 'int' to 'short' changes value from 65535 to -1 [-Werror,-Wconstant-conversion] FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 
0xC0000000); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/x86/math-emu/reg_constant.c:21:25: note: expanded from macro 'MAKE_REG' ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~ The code is correct as is, so add a typecast to shut up the warnings. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190712090816.350668-1-arnd@arndb.de --- arch/x86/math-emu/fpu_emu.h | 2 +- arch/x86/math-emu/reg_constant.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h index a5a41ec58072..0c122226ca56 100644 --- a/arch/x86/math-emu/fpu_emu.h +++ b/arch/x86/math-emu/fpu_emu.h @@ -177,7 +177,7 @@ static inline void reg_copy(FPU_REG const *x, FPU_REG *y) #define setexponentpos(x,y) { (*(short *)&((x)->exp)) = \ ((y) + EXTENDED_Ebias) & 0x7fff; } #define exponent16(x) (*(short *)&((x)->exp)) -#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (y); } +#define setexponent16(x,y) { (*(short *)&((x)->exp)) = (u16)(y); } #define addexponent(x,y) { (*(short *)&((x)->exp)) += (y); } #define stdexp(x) { (*(short *)&((x)->exp)) += EXTENDED_Ebias; } diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c index 8dc9095bab22..742619e94bdf 100644 --- a/arch/x86/math-emu/reg_constant.c +++ b/arch/x86/math-emu/reg_constant.c @@ -18,7 +18,7 @@ #include "control_w.h" #define MAKE_REG(s, e, l, h) { l, h, \ - ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } + (u16)((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); #if 0 -- cgit v1.2.3-59-g8ed1b From 50e04acf2990d0d93983720b0a85b11ef805df60 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 13 Jul 2019 00:41:52 +0200 Subject: x86/process: Delete useless check for dead process with LDT At release_thread(), ->mm is NULL; and it is fine for the former mm to still have an LDT. 
Delete this check in process_64.c, similar to commit 2684927c6b93 ("[PATCH] x86: Deprecate useless bug"), which did the same in process_32.c. Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190712224152.13129-1-jannh@google.com --- arch/x86/kernel/process_64.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 250e4c4ac6d9..af64519b2695 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -143,17 +143,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) void release_thread(struct task_struct *dead_task) { - if (dead_task->mm) { -#ifdef CONFIG_MODIFY_LDT_SYSCALL - if (dead_task->mm->context.ldt) { - pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt->entries, - dead_task->mm->context.ldt->nr_entries); - BUG(); - } -#endif - } + WARN_ON(dead_task->mm); } enum which_selector { -- cgit v1.2.3-59-g8ed1b From 406de552c2be6ded524c75d14a73cf7f027f587e Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Thu, 28 Mar 2019 12:06:37 +0000 Subject: MAINTAINERS: Update PARAVIRT_OPS_INTERFACE and VMWARE_HYPERVISOR_INTERFACE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alok Kataria will be handing over VMware's maintainership of these interfaces to Thomas Hellström, with pv-drivers as backup contact. 
Signed-off-by: Thomas Hellstrom Signed-off-by: Thomas Gleixner Acked-by: Alok Kataria Acked-by: Juergen Gross Link: https://lkml.kernel.org/r/20190328120558.29897-1-thellstrom@vmware.com --- MAINTAINERS | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f5533d1bda2e..80fa7a4a0b56 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12074,7 +12074,8 @@ F: Documentation/parport*.txt PARAVIRT_OPS INTERFACE M: Juergen Gross -M: Alok Kataria +M: Thomas Hellstrom +M: "VMware, Inc." L: virtualization@lists.linux-foundation.org S: Supported F: Documentation/virtual/paravirt_ops.txt @@ -17087,7 +17088,8 @@ S: Maintained F: drivers/misc/vmw_balloon.c VMWARE HYPERVISOR INTERFACE -M: Alok Kataria +M: Thomas Hellstrom +M: "VMware, Inc." L: virtualization@lists.linux-foundation.org S: Supported F: arch/x86/kernel/cpu/vmware.c -- cgit v1.2.3-59-g8ed1b From 55aedddb6149ab71bec9f050846855113977b033 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jul 2019 13:40:55 +0200 Subject: x86/paravirt: Make read_cr2() CALLEE_SAVE The one paravirt read_cr2() implementation (Xen) is actually quite trivial and doesn't need to clobber anything other than the return register. Making read_cr2() CALLEE_SAVE avoids all the PUSH/POP nonsense and allows more convenient use from assembly. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: bp@alien8.de Cc: rostedt@goodmis.org Cc: luto@kernel.org Cc: torvalds@linux-foundation.org Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: zhe.he@windriver.com Cc: joel@joelfernandes.org Cc: devel@etsukata.com Link: https://lkml.kernel.org/r/20190711114335.887392493@infradead.org --- arch/x86/entry/calling.h | 6 ++++++ arch/x86/include/asm/paravirt.h | 22 +++++++++++++--------- arch/x86/include/asm/paravirt_types.h | 2 +- arch/x86/kernel/asm-offsets.c | 1 + arch/x86/kernel/head_64.S | 4 +--- arch/x86/kernel/paravirt.c | 2 +- arch/x86/xen/enlighten_pv.c | 3 ++- arch/x86/xen/mmu_pv.c | 12 +----------- arch/x86/xen/xen-asm.S | 16 ++++++++++++++++ arch/x86/xen/xen-ops.h | 3 +++ 10 files changed, 45 insertions(+), 26 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9f1f9e3b8230..830bd984182b 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -343,3 +343,9 @@ For 32-bit we have the following conventions - kernel is built with .Lafter_call_\@: #endif .endm + +#ifdef CONFIG_PARAVIRT_XXL +#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg +#else +#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg +#endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c25c38a05c1c..5135282683d4 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -116,7 +116,7 @@ static inline void write_cr0(unsigned long x) static inline unsigned long read_cr2(void) { - return PVOP_CALL0(unsigned long, mmu.read_cr2); + return PVOP_CALLEE0(unsigned long, mmu.read_cr2); } static inline void write_cr2(unsigned long x) @@ -909,13 +909,7 @@ extern void default_banner(void); ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \ ) -#endif - -#define GET_CR2_INTO_RAX \ - ANNOTATE_RETPOLINE_SAFE; \ - call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); -#ifdef 
CONFIG_PARAVIRT_XXL #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \ ANNOTATE_RETPOLINE_SAFE; \ @@ -929,9 +923,19 @@ extern void default_banner(void); call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #endif -#endif +#endif /* CONFIG_PARAVIRT_XXL */ +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_PARAVIRT_XXL + +#define GET_CR2_INTO_AX \ + PARA_SITE(PARA_PATCH(PV_MMU_read_cr2), \ + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); \ + ) + +#endif /* CONFIG_PARAVIRT_XXL */ -#endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 946f8f1f1efc..639b2df445ee 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -220,7 +220,7 @@ struct pv_mmu_ops { void (*exit_mmap)(struct mm_struct *mm); #ifdef CONFIG_PARAVIRT_XXL - unsigned long (*read_cr2)(void); + struct paravirt_callee_save read_cr2; void (*write_cr2)(unsigned long); unsigned long (*read_cr3)(void); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index da64452584b0..5c7ee3df4d0b 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -76,6 +76,7 @@ static void __used common(void) BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); + OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2); #endif BLANK(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index bcd206c8ac90..0e2d72929a8c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -29,9 +29,7 @@ #ifdef CONFIG_PARAVIRT_XXL #include #include -#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg #else -#define GET_CR2_INTO(reg) movq %cr2, reg #define INTERRUPT_RETURN iretq #endif @@ -323,7 +321,7 @@ early_idt_handler_common: cmpq $14,%rsi /* 
Page fault? */ jnz 10f - GET_CR2_INTO(%rdi) /* Can clobber any volatile register if pv */ + GET_CR2_INTO(%rdi) /* can clobber %rax if pv */ call early_make_pgtable andl %eax,%eax jz 20f /* All good */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 98039d7fb998..0aa6256eedd8 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -370,7 +370,7 @@ struct paravirt_patch_template pv_ops = { .mmu.exit_mmap = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL - .mmu.read_cr2 = native_read_cr2, + .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(native_read_cr2), .mmu.write_cr2 = native_write_cr2, .mmu.read_cr3 = __native_read_cr3, .mmu.write_cr3 = native_write_cr3, diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 4722ba2966ac..26b63d051bda 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -998,7 +998,8 @@ void __init xen_setup_vcpu_info_placement(void) __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); pv_ops.irq.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); - pv_ops.mmu.read_cr2 = xen_read_cr2_direct; + pv_ops.mmu.read_cr2 = + __PV_IS_CALLEE_SAVE(xen_read_cr2_direct); } } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index f6e5eeecfc69..26e8b326966d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1307,16 +1307,6 @@ static void xen_write_cr2(unsigned long cr2) this_cpu_read(xen_vcpu)->arch.cr2 = cr2; } -static unsigned long xen_read_cr2(void) -{ - return this_cpu_read(xen_vcpu)->arch.cr2; -} - -unsigned long xen_read_cr2_direct(void) -{ - return this_cpu_read(xen_vcpu_info.arch.cr2); -} - static noinline void xen_flush_tlb(void) { struct mmuext_op *op; @@ -2397,7 +2387,7 @@ static void xen_leave_lazy_mmu(void) } static const struct pv_mmu_ops xen_mmu_ops __initconst = { - .read_cr2 = xen_read_cr2, + .read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2), .write_cr2 = xen_write_cr2, .read_cr3 = xen_read_cr3, diff --git a/arch/x86/xen/xen-asm.S 
b/arch/x86/xen/xen-asm.S index 8019edd0125c..be104eef80be 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -135,3 +136,18 @@ ENTRY(check_events) FRAME_END ret ENDPROC(check_events) + +ENTRY(xen_read_cr2) + FRAME_BEGIN + _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX + _ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX + FRAME_END + ret + ENDPROC(xen_read_cr2); + +ENTRY(xen_read_cr2_direct) + FRAME_BEGIN + _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX + FRAME_END + ret + ENDPROC(xen_read_cr2_direct); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2f111f47ba98..45a441c33d6d 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -134,6 +134,9 @@ __visible void xen_irq_disable_direct(void); __visible unsigned long xen_save_fl_direct(void); __visible void xen_restore_fl_direct(unsigned long); +__visible unsigned long xen_read_cr2(void); +__visible unsigned long xen_read_cr2_direct(void); + /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); __visible void xen_sysret32(void); -- cgit v1.2.3-59-g8ed1b From e67f1c11e5ea7fa47449a16325ecc997dbbf9bdf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jul 2019 13:40:56 +0200 Subject: x86/entry/32: Simplify common_exception Adding one more option to SAVE_ALL can be used in common_exception to simplify things. This also saves duplication later where page_fault will no longer use common_exception. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Andy Lutomirski Cc: bp@alien8.de Cc: torvalds@linux-foundation.org Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: jgross@suse.com Cc: zhe.he@windriver.com Cc: joel@joelfernandes.org Cc: devel@etsukata.com Link: https://lkml.kernel.org/r/20190711114335.945136187@infradead.org --- arch/x86/entry/entry_32.S | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 90b473297299..4d4b6100f0e8 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -294,9 +294,11 @@ .Lfinished_frame_\@: .endm -.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 cld +.if \skip_gs == 0 PUSH_GS +.endif FIXUP_FRAME pushl %fs pushl %es @@ -313,13 +315,13 @@ movl %edx, %es movl $(__KERNEL_PERCPU), %edx movl %edx, %fs +.if \skip_gs == 0 SET_KERNEL_GS %edx - +.endif /* Switch to kernel stack if necessary */ .if \switch_stacks > 0 SWITCH_TO_KERNEL_STACK .endif - .endm .macro SAVE_ALL_NMI cr3_reg:req @@ -1448,32 +1450,20 @@ END(page_fault) common_exception: /* the function address is in %gs's slot on the stack */ - FIXUP_FRAME - pushl %fs - pushl %es - pushl %ds - pushl %eax - movl $(__USER_DS), %eax - movl %eax, %ds - movl %eax, %es - movl $(__KERNEL_PERCPU), %eax - movl %eax, %fs - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - SWITCH_TO_KERNEL_STACK + SAVE_ALL switch_stacks=1 skip_gs=1 ENCODE_FRAME_POINTER - cld UNWIND_ESPFIX_STACK + + /* fixup %gs */ GS_TO_REG %ecx movl PT_GS(%esp), %edi # get the function address - movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart REG_TO_PTGS %ecx SET_KERNEL_GS %ecx + + /* fixup orig %eax */ + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # 
no syscall to restart + TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer CALL_NOSPEC %edi -- cgit v1.2.3-59-g8ed1b From 2fd37912cfb019228bf246215938e6f7619516a2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jul 2019 13:40:57 +0200 Subject: x86/entry/64: Simplify idtentry a little There's a bunch of duplication in idtentry, namely the .Lfrom_usermode_switch_stack is a paranoid=0 copy of the normal flow. Make this explicit by creating a idtentry_part helper macro. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Cc: bp@alien8.de Cc: torvalds@linux-foundation.org Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: jgross@suse.com Cc: zhe.he@windriver.com Cc: joel@joelfernandes.org Cc: devel@etsukata.com Link: https://lkml.kernel.org/r/20190711114336.002429503@infradead.org --- arch/x86/entry/entry_64.S | 102 ++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0ea4831a72a4..3db5fede743b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -864,6 +864,52 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) +.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0 + + .if \paranoid + call paranoid_entry + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ + .else + call error_entry + .endif + UNWIND_HINT_REGS + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + + movq %rsp, %rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi, %esi /* no error code */ + .endif + + .if \shift_ist != 
-1 + subq $\ist_offset, CPU_TSS_IST(\shift_ist) + .endif + + call \do_sym + + .if \shift_ist != -1 + addq $\ist_offset, CPU_TSS_IST(\shift_ist) + .endif + + .if \paranoid + /* this procedure expect "no swapgs" flag in ebx */ + jmp paranoid_exit + .else + jmp error_exit + .endif + +.endm + /** * idtentry - Generate an IDT entry stub * @sym: Name of the generated entry point @@ -934,47 +980,7 @@ ENTRY(\sym) .Lfrom_usermode_no_gap_\@: .endif - .if \paranoid - call paranoid_entry - .else - call error_entry - .endif - UNWIND_HINT_REGS - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - - .if \paranoid - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - .endif - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - call \do_sym - - .if \shift_ist != -1 - addq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - /* these procedures expect "no swapgs" flag in ebx */ - .if \paranoid - jmp paranoid_exit - .else - jmp error_exit - .endif + idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset .if \paranoid == 1 /* @@ -983,21 +989,9 @@ ENTRY(\sym) * run in real process context if user_mode(regs). 
*/ .Lfrom_usermode_switch_stack_\@: - call error_entry - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ + idtentry_part \do_sym, \has_error_code, paranoid=0 .endif - call \do_sym - - jmp error_exit - .endif _ASM_NOKPROBE(\sym) END(\sym) .endm -- cgit v1.2.3-59-g8ed1b From 4234653e882740cbf6625eeee294e388b3176583 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jul 2019 13:40:58 +0200 Subject: x86/entry/64: Update comments and sanity tests for create_gap Commit 2700fefdb2d9 ("x86_64: Add gap to int3 to allow for call emulation") forgot to update the comment, do so now. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Cc: bp@alien8.de Cc: torvalds@linux-foundation.org Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: jgross@suse.com Cc: zhe.he@windriver.com Cc: joel@joelfernandes.org Cc: devel@etsukata.com Link: https://lkml.kernel.org/r/20190711114336.059780563@infradead.org --- arch/x86/entry/entry_64.S | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3db5fede743b..95ae05f0edf2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -913,15 +913,16 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /** * idtentry - Generate an IDT entry stub * @sym: Name of the generated entry point - * @do_sym: C function to be called - * @has_error_code: True if this IDT vector has an error code on the stack - * @paranoid: non-zero means that this vector may be invoked from + * @do_sym: C function to be called + * @has_error_code: True if this IDT vector has an error code on the stack + * @paranoid: non-zero means that this vector may be invoked from * kernel mode with user GSBASE 
and/or user CR3. * 2 is special -- see below. * @shift_ist: Set to an IST index if entries from kernel mode should - * decrement the IST stack so that nested entries get a + * decrement the IST stack so that nested entries get a * fresh stack. (This is for #DB, which has a nasty habit - * of recursing.) + * of recursing.) + * @create_gap: create a 6-word stack gap when coming from kernel mode. * * idtentry generates an IDT stub that sets up a usable kernel context, * creates struct pt_regs, and calls @do_sym. The stub has the following @@ -951,10 +952,14 @@ ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 /* Sanity check */ - .if \shift_ist != -1 && \paranoid == 0 + .if \shift_ist != -1 && \paranoid != 1 .error "using shift_ist requires paranoid=1" .endif + .if \create_gap && \paranoid + .error "using create_gap requires paranoid=0" + .endif + ASM_CLAC .if \has_error_code == 0 -- cgit v1.2.3-59-g8ed1b From a0d14b8909de55139b8702fe0c7e80b69763dcfb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jul 2019 13:40:59 +0200 Subject: x86/mm, tracing: Fix CR2 corruption Despite the current efforts to read CR2 before tracing happens there still exist a number of possible holes: idtentry page_fault do_page_fault has_error_code=1 call error_entry TRACE_IRQS_OFF call trace_hardirqs_off* #PF // modifies CR2 CALL_enter_from_user_mode __context_tracking_exit() trace_user_exit(0) #PF // modifies CR2 call do_page_fault address = read_cr2(); /* whoopsie */ And similar for i386. Fix it by pulling the CR2 read into the entry code, before any of that stuff gets a chance to run and ruin things. 
Reported-by: He Zhe Reported-by: Eiichi Tsukata Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Cc: bp@alien8.de Cc: rostedt@goodmis.org Cc: torvalds@linux-foundation.org Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: jgross@suse.com Cc: joel@joelfernandes.org Link: https://lkml.kernel.org/r/20190711114336.116812491@infradead.org Debugged-by: Steven Rostedt --- arch/x86/entry/entry_32.S | 25 ++++++++++++++++++++++--- arch/x86/entry/entry_64.S | 35 ++++++++++++++++++----------------- arch/x86/include/asm/kvm_para.h | 2 +- arch/x86/include/asm/traps.h | 4 ++-- arch/x86/kernel/kvm.c | 8 ++++---- arch/x86/kernel/traps.c | 6 +----- arch/x86/mm/fault.c | 30 +++++++++++------------------- 7 files changed, 59 insertions(+), 51 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4d4b6100f0e8..2bb986f305ac 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, ENTRY(page_fault) ASM_CLAC - pushl $do_page_fault - ALIGN - jmp common_exception + pushl $0; /* %gs's slot on the stack */ + + SAVE_ALL switch_stacks=1 skip_gs=1 + + ENCODE_FRAME_POINTER + UNWIND_ESPFIX_STACK + + /* fixup %gs */ + GS_TO_REG %ecx + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + + GET_CR2_INTO(%ecx) # might clobber %eax + + /* fixup orig %eax */ + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + + TRACE_IRQS_OFF + movl %esp, %eax # pt_regs pointer + call do_page_fault + jmp ret_from_exception END(page_fault) common_exception: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 95ae05f0edf2..7cb2e1f1ec09 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -864,7 +864,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) 
-.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0 +.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0 .if \paranoid call paranoid_entry @@ -874,12 +874,21 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .endif UNWIND_HINT_REGS - .if \paranoid + .if \read_cr2 + GET_CR2_INTO(%rdx); /* can clobber %rax */ + .endif + .if \shift_ist != -1 TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ .else TRACE_IRQS_OFF .endif + + .if \paranoid == 0 + testb $3, CS(%rsp) + jz .Lfrom_kernel_no_context_tracking_\@ + CALL_enter_from_user_mode +.Lfrom_kernel_no_context_tracking_\@: .endif movq %rsp, %rdi /* pt_regs pointer */ @@ -923,6 +932,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * fresh stack. (This is for #DB, which has a nasty habit * of recursing.) * @create_gap: create a 6-word stack gap when coming from kernel mode. + * @read_cr2: load CR2 into the 3rd argument; done before calling any C code * * idtentry generates an IDT stub that sets up a usable kernel context, * creates struct pt_regs, and calls @do_sym. The stub has the following @@ -947,7 +957,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will never switch stacks. This is for * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. 
*/ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -985,7 +995,7 @@ ENTRY(\sym) .Lfrom_usermode_no_gap_\@: .endif - idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset + idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset .if \paranoid == 1 /* @@ -994,7 +1004,7 @@ ENTRY(\sym) * run in real process context if user_mode(regs). */ .Lfrom_usermode_switch_stack_\@: - idtentry_part \do_sym, \has_error_code, paranoid=0 + idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0 .endif _ASM_NOKPROBE(\sym) @@ -1006,7 +1016,7 @@ idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 idtentry invalid_op do_invalid_op has_error_code=0 idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry invalid_TSS do_invalid_TSS has_error_code=1 idtentry segment_not_present do_segment_not_present has_error_code=1 @@ -1179,10 +1189,10 @@ idtentry xenint3 do_int3 has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 -idtentry page_fault do_page_fault has_error_code=1 +idtentry page_fault do_page_fault has_error_code=1 read_cr2=1 #ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 +idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1 #endif #ifdef CONFIG_X86_MCE @@ -1281,18 +1291,9 @@ ENTRY(error_entry) movq %rax, %rsp /* switch stack */ ENCODE_FRAME_POINTER pushq %r12 - - /* - * We need to tell lockdep that IRQs are off. 
We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). - */ - TRACE_IRQS_OFF - CALL_enter_from_user_mode ret .Lerror_entry_done: - TRACE_IRQS_OFF ret /* diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 5ed3cf1c3934..9b4df6eaa11a 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -92,7 +92,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); -void do_async_page_fault(struct pt_regs *regs, unsigned long error_code); +void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init kvm_spinlock_init(void); diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 7d6f3f3fad78..5dd1674ddf4c 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -74,14 +74,14 @@ dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code); #ifdef CONFIG_X86_64 -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code); +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long address); asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); void __init trap_init(void); #endif dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code); -dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code); +dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); dotraplinkage void 
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code); dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code); dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 82caf01b63dd..3231440d6253 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); dotraplinkage void -do_async_page_fault(struct pt_regs *regs, unsigned long error_code) +do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { enum ctx_state prev_state; switch (kvm_read_and_reset_pf_reason()) { default: - do_page_fault(regs, error_code); + do_page_fault(regs, error_code, address); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs)); + kvm_async_pf_task_wait((u32)address, !user_mode(regs)); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); - kvm_async_pf_task_wake((u32)read_cr2()); + kvm_async_pf_task_wake((u32)address); rcu_irq_exit(); break; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 87095a477154..4bb0f8447112 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -313,13 +313,10 @@ __visible void __noreturn handle_stack_overflow(const char *message, #ifdef CONFIG_X86_64 /* Runs on IST stack */ -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) { static const char str[] = "double fault"; struct task_struct *tsk = current; -#ifdef CONFIG_VMAP_STACK - unsigned long cr2; -#endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -415,7 +412,6 @@ dotraplinkage void do_double_fault(struct 
pt_regs *regs, long error_code) * stack even if the actual trigger for the double fault was * something else. */ - cr2 = read_cr2(); if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); #endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 794f364cb882..0799cc79efd3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1507,9 +1507,8 @@ good_area: NOKPROBE_SYMBOL(do_user_addr_fault); /* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. + * Explicitly marked noinline such that the function tracer sees this as the + * page_fault entry point. */ static noinline void __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, @@ -1528,33 +1527,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, } NOKPROBE_SYMBOL(__do_page_fault); -static nokprobe_inline void -trace_page_fault_entries(unsigned long address, struct pt_regs *regs, - unsigned long error_code) +static __always_inline void +trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { + if (!trace_pagefault_enabled()) + return; + if (user_mode(regs)) trace_page_fault_user(address, regs, error_code); else trace_page_fault_kernel(address, regs, error_code); } -/* - * We must have this function blacklisted from kprobes, tagged with notrace - * and call read_cr2() before calling anything else. To avoid calling any - * kind of tracing machinery before we've observed the CR2 value. - * - * exception_{enter,exit}() contains all sorts of tracepoints. 
- */ -dotraplinkage void notrace -do_page_fault(struct pt_regs *regs, unsigned long error_code) +dotraplinkage void +do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - unsigned long address = read_cr2(); /* Get the faulting address */ enum ctx_state prev_state; prev_state = exception_enter(); - if (trace_pagefault_enabled()) - trace_page_fault_entries(address, regs, error_code); - + trace_page_fault_entries(regs, error_code, address); __do_page_fault(regs, error_code, address); exception_exit(prev_state); } -- cgit v1.2.3-59-g8ed1b From cd6697b8b8751b65abd7859af55cf06f36b8e716 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 16 Jul 2019 21:15:57 +0800 Subject: x86/boot/efi: Remove unused variables Fix gcc warnings: arch/x86/boot/compressed/eboot.c: In function 'make_boot_params': arch/x86/boot/compressed/eboot.c:394:6: warning: unused variable 'i' [-Wunused-variable] int i; ^ arch/x86/boot/compressed/eboot.c:393:6: warning: unused variable 's1' [-Wunused-variable] u8 *s1; ^ arch/x86/boot/compressed/eboot.c:392:7: warning: unused variable 's2' [-Wunused-variable] u16 *s2; ^ arch/x86/boot/compressed/eboot.c:387:8: warning: unused variable 'options' [-Wunused-variable] void *options, *handle; ^ arch/x86/boot/compressed/eboot.c: In function 'add_e820ext': arch/x86/boot/compressed/eboot.c:498:16: warning: unused variable 'size' [-Wunused-variable] unsigned long size; ^ arch/x86/boot/compressed/eboot.c:497:15: warning: unused variable 'status' [-Wunused-variable] efi_status_t status; ^ arch/x86/boot/compressed/eboot.c: In function 'exit_boot_func': arch/x86/boot/compressed/eboot.c:681:15: warning: unused variable 'status' [-Wunused-variable] efi_status_t status; ^ arch/x86/boot/compressed/eboot.c:680:8: warning: unused variable 'nr_desc' [-Wunused-variable] __u32 nr_desc; ^ arch/x86/boot/compressed/eboot.c: In function 'efi_main': arch/x86/boot/compressed/eboot.c:750:22: warning: unused variable 'image' [-Wunused-variable] 
efi_loaded_image_t *image; ^ Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1563282957-26898-1-git-send-email-zhenzhong.duan@oracle.com --- arch/x86/boot/compressed/eboot.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 220d1279d0e2..d6662fdef300 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -384,14 +384,11 @@ struct boot_params *make_boot_params(struct efi_config *c) struct apm_bios_info *bi; struct setup_header *hdr; efi_loaded_image_t *image; - void *options, *handle; + void *handle; efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; int options_size = 0; efi_status_t status; char *cmdline_ptr; - u16 *s2; - u8 *s1; - int i; unsigned long ramdisk_addr; unsigned long ramdisk_size; @@ -494,8 +491,6 @@ static void add_e820ext(struct boot_params *params, struct setup_data *e820ext, u32 nr_entries) { struct setup_data *data; - efi_status_t status; - unsigned long size; e820ext->type = SETUP_E820_EXT; e820ext->len = nr_entries * sizeof(struct boot_e820_entry); @@ -677,8 +672,6 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, void *priv) { const char *signature; - __u32 nr_desc; - efi_status_t status; struct exit_boot_struct *p = priv; signature = efi_is_64bit() ? 
EFI64_LOADER_SIGNATURE @@ -747,7 +740,6 @@ struct boot_params * efi_main(struct efi_config *c, struct boot_params *boot_params) { struct desc_ptr *gdt = NULL; - efi_loaded_image_t *image; struct setup_header *hdr = &boot_params->hdr; efi_status_t status; struct desc_struct *desc; -- cgit v1.2.3-59-g8ed1b From 449f328637e3ca62461da04d60ccb35aa5aa21dc Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 16 Jul 2019 21:17:20 +0800 Subject: x86/boot/compressed/64: Remove unused variable Fix gcc warning: arch/x86/boot/compressed/pgtable_64.c: In function 'find_trampoline_placement': arch/x86/boot/compressed/pgtable_64.c:43:16: warning: unused variable 'trampoline_start' [-Wunused-variable] unsigned long trampoline_start; ^ Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Acked-by: Kirill A. Shutemov Link: https://lkml.kernel.org/r/1563283040-31101-1-git-send-email-zhenzhong.duan@oracle.com --- arch/x86/boot/compressed/pgtable_64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index f8debf7aeb4c..5f2d03067ae5 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -40,7 +40,6 @@ int cmdline_find_option_bool(const char *option); static unsigned long find_trampoline_placement(void) { unsigned long bios_start = 0, ebda_start = 0; - unsigned long trampoline_start; struct boot_e820_entry *entry; char *signature; int i; -- cgit v1.2.3-59-g8ed1b From 8c5477e8046ca139bac250386c08453da37ec1ae Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 16 Jul 2019 21:18:12 +0800 Subject: x86, boot: Remove multiple copy of static function sanitize_boot_params() Kernel build warns: 'sanitize_boot_params' defined but not used [-Wunused-function] at below files: arch/x86/boot/compressed/cmdline.c arch/x86/boot/compressed/error.c arch/x86/boot/compressed/early_serial_console.c arch/x86/boot/compressed/acpi.c That's because they each include misc.h which
includes a definition of sanitize_boot_params() via bootparam_utils.h. Remove the inclusion from misc.h and have the c file including bootparam_utils.h directly. Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1563283092-1189-1-git-send-email-zhenzhong.duan@oracle.com --- arch/x86/boot/compressed/misc.c | 1 + arch/x86/boot/compressed/misc.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 24e65a0f756d..53ac0cb2396d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -17,6 +17,7 @@ #include "pgtable.h" #include "../string.h" #include "../voffset.h" +#include <asm/bootparam_utils.h> /* * WARNING!! diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index d2f184165934..c8181392f70d 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -23,7 +23,6 @@ #include #include #include -#include <asm/bootparam_utils.h> #define BOOT_CTYPE_H #include -- cgit v1.2.3-59-g8ed1b From e320ab3cec7dd8b1606964d81ae1e14391ff8e96 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Fri, 19 Jul 2019 03:22:35 +0000 Subject: x86/hyper-v: Zero out the VP ASSIST PAGE on allocation The VP ASSIST PAGE is an "overlay" page (see Hyper-V TLFS's Section 5.2.1 "GPA Overlay Pages" for the details) and here is an excerpt: "The hypervisor defines several special pages that "overlay" the guest's Guest Physical Addresses (GPA) space. Overlays are addressed GPA but are not included in the normal GPA map maintained internally by the hypervisor. Conceptually, they exist in a separate map that overlays the GPA map. If a page within the GPA space is overlaid, any SPA page mapped to the GPA page is effectively "obscured" and generally unreachable by the virtual processor through processor memory accesses. If an overlay page is disabled, the underlying GPA page is "uncovered", and an existing mapping becomes accessible to the guest."
SPA = System Physical Address = the final real physical address. When a CPU (e.g. CPU1) is onlined, hv_cpu_init() allocates the VP ASSIST PAGE and enables the EOI optimization for this CPU by writing the MSR HV_X64_MSR_VP_ASSIST_PAGE. From now on, hvp->apic_assist belongs to the special SPA page, and this CPU *always* uses hvp->apic_assist (which is shared with the hypervisor) to decide if it needs to write the EOI MSR. When a CPU is offlined then on the outgoing CPU: 1. hv_cpu_die() disables the EOI optimization for this CPU, and from now on hvp->apic_assist belongs to the original "normal" SPA page; 2. the remaining work of stopping this CPU is done 3. this CPU is completely stopped. Between 1 and 3, this CPU can still receive interrupts (e.g. reschedule IPIs from CPU0, and Local APIC timer interrupts), and this CPU *must* write the EOI MSR for every interrupt received, otherwise the hypervisor may not deliver further interrupts, which may be needed to completely stop the CPU. So, after the EOI optimization is disabled in hv_cpu_die(), it's required that the hvp->apic_assist's bit0 is zero, which is not guaranteed by the current allocation mode because it lacks __GFP_ZERO. As a consequence the bit might be set and interrupt handling would not write the EOI MSR causing interrupt delivery to become stuck. Add the missing __GFP_ZERO to the allocation. Note 1: after the "normal" SPA page is allocated and zeroed out, neither the hypervisor nor the guest writes into the page, so the page remains with zeros. Note 2: see Section 10.3.5 "EOI Assist" for the details of the EOI optimization. When the optimization is enabled, the guest can still write the EOI MSR register irrespective of the "No EOI required" value, but that's slower than the optimized assist based variant.
Fixes: ba696429d290 ("x86/hyper-v: Implement EOI assist") Signed-off-by: Dexuan Cui Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/ Date: Sat, 20 Jul 2019 10:56:41 +0200 Subject: x86/entry/64: Prevent clobbering of saved CR2 value The recent fix for CR2 corruption introduced a new way to reliably corrupt the saved CR2 value. CR2 is saved early in the entry code in RDX, which is the third argument to the fault handling functions. But it missed that between saving and invoking the fault handler enter_from_user_mode() can be called. RDX is a caller saved register so the invoked function can freely clobber it with the obvious consequences. The TRACE_IRQS_OFF call is safe as it calls through the thunk which preserves RDX, but TRACE_IRQS_OFF_DEBUG is not because it also calls into C-code outside of the thunk. Store CR2 in R12 instead which is a callee saved register and move R12 to RDX just before calling the fault handler. Fixes: a0d14b8909de ("x86/mm, tracing: Fix CR2 corruption") Reported-by: Sean Christopherson Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907201020540.1782@nanos.tec.linutronix.de --- arch/x86/entry/entry_64.S | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7cb2e1f1ec09..f7c70c1bee8b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -875,7 +875,12 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt UNWIND_HINT_REGS .if \read_cr2 - GET_CR2_INTO(%rdx); /* can clobber %rax */ + /* + * Store CR2 early so subsequent faults cannot clobber it. Use R12 as + * intermediate storage as RDX can be clobbered in enter_from_user_mode(). + * GET_CR2_INTO can clobber RAX. 
+ */ + GET_CR2_INTO(%r12); .endif .if \shift_ist != -1 @@ -904,6 +909,10 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt subq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif + .if \read_cr2 + movq %r12, %rdx /* Move CR2 into 3rd argument */ + .endif + call \do_sym .if \shift_ist != -1 -- cgit v1.2.3-59-g8ed1b