From 8f34c5b5afce91d171bb0802631197484cb69b8b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:45 +0200 Subject: x86/exceptions: Make IST index zero based The defines for the exception stack (IST) array in the TSS are using the SDM convention IST1 - IST7. That causes all sorts of code to subtract 1 for array indices related to IST. That's confusing at best and does not provide any value. Make the indices zero based and fixup the usage sites. The only code which needs to adjust the 0 based index is the interrupt descriptor setup which needs to add 1 now. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: Andy Lutomirski Cc: Baoquan He Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: Dou Liyang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: "Kirill A. Shutemov" Cc: Konrad Rzeszutek Wilk Cc: linux-doc@vger.kernel.org Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Qian Cai Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.331772825@linutronix.de --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb28e98a0659..0e4cb718fc4a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -516,7 +516,7 @@ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); */ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ + [ESTACK_DB] = DEBUG_STKSZ }; #endif @@ -1760,7 +1760,7 @@ void cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; - if (v == DEBUG_STACK-1) + if (v == ESTACK_DB) per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } -- cgit v1.2.3-59-g8ed1b From 019b17b3ffe48100e52f609ca1c6ed6e5a40cba1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:47 +0200 Subject: x86/exceptions: Add structs for exception stacks At the moment everything assumes a full linear mapping of the various exception stacks. Adding guard pages to the cpu entry area mapping of the exception stacks will break that assumption. As a preparatory step convert both the real storage and the effective mapping in the cpu entry area from character arrays to structures. To ensure that both arrays have the same ordering and the same size of the individual stacks fill the members with a macro. The guard size is the only difference between the two resulting structures. For now both have guard size 0 until the preparation of all usage sites is done. Provide a couple of helper macros which are used in the following conversions. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160144.506807893@linutronix.de --- arch/x86/include/asm/cpu_entry_area.h | 52 +++++++++++++++++++++++++++++++---- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/mm/cpu_entry_area.c | 8 ++---- 3 files changed, 51 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 29c706415443..af8c312673de 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -7,6 +7,51 @@ #include #include +#ifdef CONFIG_X86_64 + +/* Macro to enforce the same ordering and stack sizes */ +#define ESTACKS_MEMBERS(guardsize) \ + char DF_stack_guard[guardsize]; \ + char DF_stack[EXCEPTION_STKSZ]; \ + char NMI_stack_guard[guardsize]; \ + char NMI_stack[EXCEPTION_STKSZ]; \ + char DB_stack_guard[guardsize]; \ + char DB_stack[DEBUG_STKSZ]; \ + char MCE_stack_guard[guardsize]; \ + char MCE_stack[EXCEPTION_STKSZ]; \ + char IST_top_guard[guardsize]; \ + +/* The exception stacks' physical storage. No guard pages required */ +struct exception_stacks { + ESTACKS_MEMBERS(0) +}; + +/* + * The effective cpu entry area mapping with guard pages. Guard size is + * zero until the code which makes assumptions about linear mappings is + * cleaned up. + */ +struct cea_exception_stacks { + ESTACKS_MEMBERS(0) +}; + +#define CEA_ESTACK_SIZE(st) \ + sizeof(((struct cea_exception_stacks *)0)->st## _stack) + +#define CEA_ESTACK_BOT(ceastp, st) \ + ((unsigned long)&(ceastp)->st## _stack) + +#define CEA_ESTACK_TOP(ceastp, st) \ + (CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st)) + +#define CEA_ESTACK_OFFS(st) \ + offsetof(struct cea_exception_stacks, st## _stack) + +#define CEA_ESTACK_PAGES \ + (sizeof(struct cea_exception_stacks) / PAGE_SIZE) + +#endif + /* * cpu_entry_area is a percpu region that contains things needed by the CPU * and early entry/exit code. Real types aren't used for all fields here @@ -32,12 +77,9 @@ struct cpu_entry_area { #ifdef CONFIG_X86_64 /* - * Exception stacks used for IST entries. - * - * In the future, this should have a separate slot for each stack - * with guard pages between them. + * Exception stacks used for IST entries with guard pages. */ - char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; + struct cea_exception_stacks estacks; #endif #ifdef CONFIG_CPU_SUP_INTEL /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0e4cb718fc4a..24b801ea7522 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1754,7 +1754,7 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!oist->ist[0]) { - char *estacks = get_cpu_entry_area(cpu)->exception_stacks; + char *estacks = (char *)&get_cpu_entry_area(cpu)->estacks; for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index c2a54f75d335..6a09b84c13fe 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -13,8 +13,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); #ifdef CONFIG_X86_64 -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -138,9 +137,8 @@ static void __init setup_cpu_entry_area(unsigned int cpu) #ifdef CONFIG_X86_64 BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); BUILD_BUG_ON(sizeof(exception_stacks) != - sizeof(((struct cpu_entry_area *)0)->exception_stacks)); - cea_map_percpu_pages(&cea->exception_stacks, - &per_cpu(exception_stacks, cpu), + sizeof(((struct cpu_entry_area *)0)->estacks)); + cea_map_percpu_pages(&cea->estacks, &per_cpu(exception_stacks, cpu), sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); #endif percpu_setup_debug_store(cpu); -- cgit v1.2.3-59-g8ed1b From f6ef73224a0f0400c3979c8bc68b383f9d2eb9d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:53 +0200 Subject: x86/cpu: Prepare TSS.IST setup for guard pages Convert the TSS.IST setup code to use the cpu entry area information directly instead of assuming a linear mapping of the IST stacks. The store to orig_ist[] is no longer required as there are no users anymore. This is the last preparatory step towards IST guard pages. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.061686012@linutronix.de --- arch/x86/kernel/cpu/common.c | 35 +++++++---------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 24b801ea7522..4b01b71415f5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -507,19 +507,6 @@ void load_percpu_segment(int cpu) DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif -#ifdef CONFIG_X86_64 -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [ESTACK_DB] = DEBUG_STKSZ -}; -#endif - /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { @@ -1690,17 +1677,14 @@ static void setup_getcpu(int cpu) * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 void cpu_init(void) { - struct orig_ist *oist; + int cpu = raw_smp_processor_id(); struct task_struct *me; struct tss_struct *t; - unsigned long v; - int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); @@ -1715,7 +1699,6 @@ void cpu_init(void) load_ucode_ap(); t = &per_cpu(cpu_tss_rw, cpu); - oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1753,16 +1736,12 @@ void cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!oist->ist[0]) { - char *estacks = (char *)&get_cpu_entry_area(cpu)->estacks; - - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += exception_stack_sizes[v]; - oist->ist[v] = t->x86_tss.ist[v] = - (unsigned long)estacks; - if (v == ESTACK_DB) - per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; - } + if (!t->x86_tss.ist[0]) { + t->x86_tss.ist[ESTACK_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[ESTACK_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[ESTACK_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[ESTACK_MCE] = __this_cpu_ist_top_va(MCE); + per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[ESTACK_DB]; } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; -- cgit v1.2.3-59-g8ed1b From 4d68c3d0ecd5fcba8876e8a58ac41ffb360de43e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:54 +0200 Subject: x86/cpu: Remove orig_ist array All users gone. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Pingfan Liu Cc: Pu Wen Cc: Sean Christopherson Cc: Vlastimil Babka Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.151435667@linutronix.de --- arch/x86/include/asm/processor.h | 9 --------- arch/x86/kernel/cpu/common.c | 6 ------ 2 files changed, 15 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2bb3a648fc12..8fcfcd1a8375 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -374,16 +374,7 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif -/* - * Save the original ist values for checking stack pointers during debugging - */ -struct orig_ist { - unsigned long ist[7]; -}; - #ifdef CONFIG_X86_64 -DECLARE_PER_CPU(struct orig_ist, orig_ist); - union irq_stack_union { char irq_stack[IRQ_STACK_SIZE]; /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b01b71415f5..8243f198fb7f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1549,12 +1549,6 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - static DEFINE_PER_CPU(unsigned long, debug_stack_addr); DEFINE_PER_CPU(int, debug_stack_usage); -- cgit v1.2.3-59-g8ed1b From 3207426925d2b4da390be8068df1d1c2b36e5918 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:55 +0200 Subject: x86/exceptions: Disconnect IST index and stack order The entry order of the TSS.IST array and the order of the stack storage/mapping are not required to be the same. With the upcoming split of the debug stack this is going to fall apart as the number of TSS.IST array entries stays the same while the actual stacks are increasing. Make them separate so that code like dumpstack can just utilize the mapping order. The IST index is solely required for the actual TSS.IST array initialization. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Baoquan He Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: Dou Liyang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jann Horn Cc: Josh Poimboeuf Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Konrad Rzeszutek Wilk Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Qian Cai Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.241588113@linutronix.de --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/cpu_entry_area.h | 11 +++++++++++ arch/x86/include/asm/page_64_types.h | 9 ++++----- arch/x86/include/asm/stacktrace.h | 2 ++ arch/x86/kernel/cpu/common.c | 10 +++++----- arch/x86/kernel/idt.c | 8 ++++---- 6 files changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index fd0a50452cb3..5c0348504a4b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1129,7 +1129,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=ESTACK_DB +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 9b406f067ecf..310eeb62d418 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -35,6 +35,17 @@ struct cea_exception_stacks { ESTACKS_MEMBERS(0) }; +/* + * The exception stack ordering in [cea_]exception_stacks + */ +enum exception_stack_ordering { + ESTACK_DF, + ESTACK_NMI, + ESTACK_DB, + ESTACK_MCE, + N_EXCEPTION_STACKS +}; + #define CEA_ESTACK_SIZE(st) \ sizeof(((struct cea_exception_stacks *)0)->st## _stack) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 6ab2c54c1bf9..056de887b220 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -27,11 +27,10 @@ /* * The index for the tss.ist[] array. The hardware limit is 7 entries. */ -#define ESTACK_DF 0 -#define ESTACK_NMI 1 -#define ESTACK_DB 2 -#define ESTACK_MCE 3 -#define N_EXCEPTION_STACKS 4 +#define IST_INDEX_DF 0 +#define IST_INDEX_NMI 1 +#define IST_INDEX_DB 2 +#define IST_INDEX_MCE 3 /* * Set __PAGE_OFFSET to the most negative possible address + diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f335aad404a4..d6d758a187b6 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -9,6 +9,8 @@ #include #include + +#include #include enum stack_type { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8243f198fb7f..143aceaf9a9a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1731,11 +1731,11 @@ void cpu_init(void) * set up and load the per-CPU TSS */ if (!t->x86_tss.ist[0]) { - t->x86_tss.ist[ESTACK_DF] = __this_cpu_ist_top_va(DF); - t->x86_tss.ist[ESTACK_NMI] = __this_cpu_ist_top_va(NMI); - t->x86_tss.ist[ESTACK_DB] = __this_cpu_ist_top_va(DB); - t->x86_tss.ist[ESTACK_MCE] = __this_cpu_ist_top_va(MCE); - per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[ESTACK_DB]; + t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); + per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[IST_INDEX_DB]; } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 2188f734ec61..6d8917875f44 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -183,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, ESTACK_DB), - ISTG(X86_TRAP_NMI, nmi, ESTACK_NMI), - ISTG(X86_TRAP_DF, double_fault, ESTACK_DF), + ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, ESTACK_MCE), + ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), #endif }; -- cgit v1.2.3-59-g8ed1b From 2a594d4ccf3f10f80b77d71bd3dad10813ac0137 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 17:59:57 +0200 Subject: x86/exceptions: Split debug IST stack The debug IST stack is actually two separate debug stacks to handle #DB recursion. This is required because the CPU starts always at top of stack on exception entry, which means on #DB recursion the second #DB would overwrite the stack of the first. The low level entry code therefore adjusts the top of stack on entry so a secondary #DB starts from a different stack page. But the stack pages are adjacent without a guard page between them. Split the debug stack into 3 stacks which are separated by guard pages. The 3rd stack is never mapped into the cpu_entry_area and is only there to catch triple #DB nesting: --- top of DB_stack <- Initial stack --- end of DB_stack guard page --- top of DB1_stack <- Top of stack after entering first #DB --- end of DB1_stack guard page --- top of DB2_stack <- Top of stack after entering second #DB --- end of DB2_stack guard page If DB2 would not act as the final guard hole, a second #DB would point the top of #DB stack to the stack below #DB1 which would be valid and not catch the not so desired triple nesting. The backing store does not allocate any memory for DB2 and its guard page as it is not going to be mapped into the cpu_entry_area. - Adjust the low level entry code so it adjusts top of #DB with the offset between the stacks instead of exception stack size. - Make the dumpstack code aware of the new stacks. - Adjust the in_debug_stack() implementation and move it into the NMI code where it belongs. As this is NMI hotpath code, it just checks the full area between top of DB_stack and bottom of DB1_stack without checking for the guard page. That's correct because the NMI cannot hit a stackpointer pointing to the guard page between DB and DB1 stack. Even if it would, then the NMI operation still is unaffected, but the resume of the debug exception on the topmost DB stack will crash by touching the guard page. [ bp: Make exception_stack_names static const char * const ] Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Sean Christopherson Cc: Andy Lutomirski Cc: Baoquan He Cc: "Chang S. Bae" Cc: Dave Hansen Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: "Kirill A. Shutemov" Cc: Konrad Rzeszutek Wilk Cc: linux-doc@vger.kernel.org Cc: Masahiro Yamada Cc: Peter Zijlstra Cc: Qian Cai Cc: Sean Christopherson Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.439944544@linutronix.de --- Documentation/x86/kernel-stacks | 7 ++++++- arch/x86/entry/entry_64.S | 8 ++++---- arch/x86/include/asm/cpu_entry_area.h | 14 ++++++++++---- arch/x86/include/asm/debugreg.h | 2 -- arch/x86/include/asm/page_64_types.h | 3 --- arch/x86/kernel/asm-offsets_64.c | 2 ++ arch/x86/kernel/cpu/common.c | 11 ----------- arch/x86/kernel/dumpstack_64.c | 12 ++++++++---- arch/x86/kernel/nmi.c | 20 +++++++++++++++++++- arch/x86/mm/cpu_entry_area.c | 4 +++- 10 files changed, 52 insertions(+), 31 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/Documentation/x86/kernel-stacks b/Documentation/x86/kernel-stacks index 1b04596caea9..d1bfb0b95ee0 100644 --- a/Documentation/x86/kernel-stacks +++ b/Documentation/x86/kernel-stacks @@ -76,7 +76,7 @@ The currently assigned IST stacks are :- middle of switching stacks. Using IST for NMI events avoids making assumptions about the previous state of the kernel stack. -* ESTACK_DB. DEBUG_STKSZ +* ESTACK_DB. EXCEPTION_STKSZ (PAGE_SIZE). Used for hardware debug interrupts (interrupt 1) and for software debug interrupts (INT3). @@ -86,6 +86,11 @@ The currently assigned IST stacks are :- avoids making assumptions about the previous state of the kernel stack. + To handle nested #DB correctly there exist two instances of DB stacks. On + #DB entry the IST stackpointer for #DB is switched to the second instance + so a nested #DB starts from a clean stack. The nested #DB switches + the IST stackpointer to a guard hole to catch triple nesting. + * ESTACK_MCE. EXCEPTION_STKSZ (PAGE_SIZE). Used for interrupt 18 - Machine Check Exception (#MC). diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 5c0348504a4b..ee649f1f279e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -879,7 +879,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will never switch stacks. This is for * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. */ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -925,13 +925,13 @@ ENTRY(\sym) .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + subq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + addq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ @@ -1129,7 +1129,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 9c96406e6d2b..cff3f3f3bfe0 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -10,25 +10,29 @@ #ifdef CONFIG_X86_64 /* Macro to enforce the same ordering and stack sizes */ -#define ESTACKS_MEMBERS(guardsize) \ +#define ESTACKS_MEMBERS(guardsize, db2_holesize)\ char DF_stack_guard[guardsize]; \ char DF_stack[EXCEPTION_STKSZ]; \ char NMI_stack_guard[guardsize]; \ char NMI_stack[EXCEPTION_STKSZ]; \ + char DB2_stack_guard[guardsize]; \ + char DB2_stack[db2_holesize]; \ + char DB1_stack_guard[guardsize]; \ + char DB1_stack[EXCEPTION_STKSZ]; \ char DB_stack_guard[guardsize]; \ - char DB_stack[DEBUG_STKSZ]; \ + char DB_stack[EXCEPTION_STKSZ]; \ char MCE_stack_guard[guardsize]; \ char MCE_stack[EXCEPTION_STKSZ]; \ char IST_top_guard[guardsize]; \ /* The exception stacks' physical storage. No guard pages required */ struct exception_stacks { - ESTACKS_MEMBERS(0) + ESTACKS_MEMBERS(0, 0) }; /* The effective cpu entry area mapping with guard pages. */ struct cea_exception_stacks { - ESTACKS_MEMBERS(PAGE_SIZE) + ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ) }; /* @@ -37,6 +41,8 @@ struct cea_exception_stacks { enum exception_stack_ordering { ESTACK_DF, ESTACK_NMI, + ESTACK_DB2, + ESTACK_DB1, ESTACK_DB, ESTACK_MCE, N_EXCEPTION_STACKS diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 9e5ca30738e5..1a8609a15856 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -104,11 +104,9 @@ static inline void debug_stack_usage_dec(void) { __this_cpu_dec(debug_stack_usage); } -int is_debug_stack(unsigned long addr); void debug_stack_set_zero(void); void debug_stack_reset(void); #else /* !X86_64 */ -static inline int is_debug_stack(unsigned long addr) { return 0; } static inline void debug_stack_set_zero(void) { } static inline void debug_stack_reset(void) { } static inline void debug_stack_usage_inc(void) { } diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 056de887b220..793c14c372cb 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -18,9 +18,6 @@ #define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) -#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) - #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index ddced33184b5..f5281567e28e 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -68,6 +68,8 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - + offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 143aceaf9a9a..88cab45707a9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1549,17 +1549,7 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -static DEFINE_PER_CPU(unsigned long, debug_stack_addr); DEFINE_PER_CPU(int, debug_stack_usage); - -int is_debug_stack(unsigned long addr) -{ - return __this_cpu_read(debug_stack_usage) || - (addr <= __this_cpu_read(debug_stack_addr) && - addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ)); -} -NOKPROBE_SYMBOL(is_debug_stack); - DEFINE_PER_CPU(u32, debug_idt_ctr); void debug_stack_set_zero(void) @@ -1735,7 +1725,6 @@ void cpu_init(void) t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); - per_cpu(debug_stack_addr, cpu) = t->x86_tss.ist[IST_INDEX_DB]; } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index f6fbd0438f9e..fca97bd3d8ae 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -19,16 +19,18 @@ #include #include -static const char *exception_stack_names[N_EXCEPTION_STACKS] = { +static const char * const exception_stack_names[] = { [ ESTACK_DF ] = "#DF", [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB2 ] = "#DB2", + [ ESTACK_DB1 ] = "#DB1", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -58,9 +60,11 @@ struct estack_layout { .end = offsetof(struct cea_exception_stacks, x## _stack_guard) \ } -static const struct estack_layout layout[N_EXCEPTION_STACKS] = { +static const struct estack_layout layout[] = { [ ESTACK_DF ] = ESTACK_ENTRY(DF), [ ESTACK_NMI ] = ESTACK_ENTRY(NMI), + [ ESTACK_DB2 ] = { .begin = 0, .end = 0}, + [ ESTACK_DB1 ] = ESTACK_ENTRY(DB1), [ ESTACK_DB ] = ESTACK_ENTRY(DB), [ ESTACK_MCE ] = ESTACK_ENTRY(MCE), }; @@ -71,7 +75,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) struct pt_regs *regs; unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); estacks = (unsigned long)__this_cpu_read(cea_exception_stacks); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 18bc9b51ac9b..3755d0310026 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -21,13 +21,14 @@ #include #include #include +#include #include #if defined(CONFIG_EDAC) #include #endif -#include +#include #include #include #include @@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2); * switch back to the original IDT. */ static DEFINE_PER_CPU(int, update_debug_stack); + +static bool notrace is_debug_stack(unsigned long addr) +{ + struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); + unsigned long top = CEA_ESTACK_TOP(cs, DB); + unsigned long bot = CEA_ESTACK_BOT(cs, DB1); + + if (__this_cpu_read(debug_stack_usage)) + return true; + /* + * Note, this covers the guard page between DB and DB1 as well to + * avoid two checks. But by all means @addr can never point into + * the guard page. + */ + return addr >= bot && addr < top; +} +NOKPROBE_SYMBOL(is_debug_stack); #endif dotraplinkage notrace void diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index a00d0d059c8a..752ad11d6868 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -98,10 +98,12 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) /* * The exceptions stack mappings in the per cpu area are protected - * by guard pages so each stack must be mapped separately. + * by guard pages so each stack must be mapped separately. DB2 is + * not mapped; it just exists to catch triple nesting of #DB. */ cea_map_stack(DF); cea_map_stack(NMI); + cea_map_stack(DB1); cea_map_stack(DB); cea_map_stack(MCE); } -- cgit v1.2.3-59-g8ed1b From 758a2e312228410f2f5092ade558109e93dc3ee8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 18:00:02 +0200 Subject: x86/irq/64: Rename irq_stack_ptr to hardirq_stack_ptr Preparatory patch to share code with 32bit. No functional changes. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Mike Rapoport Cc: Nick Desaulniers Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Pingfan Liu Cc: Sean Christopherson Cc: Stephen Rothwell Cc: Vlastimil Babka Cc: x86-ml Link: https://lkml.kernel.org/r/20190414160145.912584074@linutronix.de --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/dumpstack_64.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- arch/x86/kernel/setup_percpu.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ee649f1f279e..726abbe6c6d8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -431,7 +431,7 @@ END(irq_entries_start) */ movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) - movq PER_CPU_VAR(irq_stack_ptr), %rsp + movq PER_CPU_VAR(hardirq_stack_ptr), %rsp #ifdef CONFIG_DEBUG_ENTRY /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0b7d866a1ad5..5e3dd4e2136d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -396,7 +396,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); } -DECLARE_PER_CPU(char *, irq_stack_ptr); +DECLARE_PER_CPU(char *, hardirq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 88cab45707a9..13ec72bb8f36 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1510,7 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, irq_stack_ptr) = +DEFINE_PER_CPU(char *, hardirq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index f356d3ea0c70..753b8cfe8b8a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -120,7 +120,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 7eb6f8d11bfd..f0c7356c8969 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -56,7 +56,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_top = (u64)__this_cpu_read(irq_stack_ptr); + irq_stack_top = (u64)__this_cpu_read(hardirq_stack_ptr); irq_stack_bottom = irq_stack_top - IRQ_STACK_SIZE + STACK_MARGIN; if (regs->sp >= irq_stack_bottom && regs->sp <= irq_stack_top) return; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4bf46575568a..657343ecc2da 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -245,7 +245,7 @@ void __init setup_per_cpu_areas(void) early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); #endif #ifdef CONFIG_X86_64 - per_cpu(irq_stack_ptr, cpu) = + per_cpu(hardirq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE; #endif -- cgit v1.2.3-59-g8ed1b From 0ac26104208450d35c4e68754ce0c67b3a4d7802 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 18:00:05 +0200 Subject: x86/irq/64: Init hardirq_stack_ptr during CPU hotplug Preparatory change for disentangling the irq stack union as a prerequisite for irq stacks with guard pages. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Konrad Rzeszutek Wilk Cc: Nicolai Stange Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Sean Christopherson Cc: x86-ml Cc: Yi Wang Link: https://lkml.kernel.org/r/20190414160146.177558566@linutronix.de --- arch/x86/include/asm/irq.h | 4 ---- arch/x86/kernel/cpu/common.c | 4 +--- arch/x86/kernel/irq_64.c | 15 +++++++++++++++ 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index d751e8440a6b..8f95686ec27e 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -16,11 +16,7 @@ static inline int irq_canonicalize(int irq) return ((irq == 2) ? 9 : irq); } -#ifdef CONFIG_X86_32 extern int irq_init_percpu_irqstack(unsigned int cpu); -#else -static inline int irq_init_percpu_irqstack(unsigned int cpu) { return 0; } -#endif #define __ARCH_HAS_DO_SOFTIRQ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 13ec72bb8f36..1222080838da 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1510,9 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, hardirq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; - +DEFINE_PER_CPU(char *, hardirq_stack_ptr); DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f0c7356c8969..c0bea0d7d76a 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -87,3 +87,18 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) generic_handle_irq_desc(desc); return true; } + +static int map_irq_stack(unsigned int cpu) +{ + void *va = per_cpu_ptr(irq_stack_union.irq_stack, cpu); + + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} + +int irq_init_percpu_irqstack(unsigned int cpu) +{ + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; + return map_irq_stack(cpu); +} -- cgit v1.2.3-59-g8ed1b From e6401c13093173aad709a5c6de00cf8d692ee786 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 14 Apr 2019 18:00:06 +0200 Subject: x86/irq/64: Split the IRQ stack into its own pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the IRQ stack is hardcoded as the first page of the percpu area, and the stack canary lives on the IRQ stack. The former gets in the way of adding an IRQ stack guard page, and the latter is a potential weakness in the stack canary mechanism. Split the IRQ stack into its own private percpu pages. [ tglx: Make 64 and 32 bit share struct irq_stack ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Ard Biesheuvel Cc: Boris Ostrovsky Cc: Brijesh Singh Cc: "Chang S. Bae" Cc: Dominik Brodowski Cc: Feng Tang Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jan Beulich Cc: Jiri Kosina Cc: Joerg Roedel Cc: Jordan Borgner Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Maran Wilson Cc: Masahiro Yamada Cc: Michal Hocko Cc: Mike Rapoport Cc: Nick Desaulniers Cc: Nicolai Stange Cc: Peter Zijlstra Cc: Pu Wen Cc: "Rafael Ávila de Espíndola" Cc: Sean Christopherson Cc: Stefano Stabellini Cc: Vlastimil Babka Cc: x86-ml Cc: xen-devel@lists.xenproject.org Link: https://lkml.kernel.org/r/20190414160146.267376656@linutronix.de --- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/processor.h | 32 ++++++++++++++------------------ arch/x86/include/asm/stackprotector.h | 6 +++--- arch/x86/kernel/asm-offsets_64.c | 2 +- arch/x86/kernel/cpu/common.c | 8 ++++---- arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/irq_64.c | 5 ++++- arch/x86/kernel/setup_percpu.c | 5 ----- arch/x86/kernel/vmlinux.lds.S | 7 ++++--- arch/x86/tools/relocs.c | 2 +- arch/x86/xen/xen-head.S | 10 +++++----- 11 files changed, 39 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 726abbe6c6d8..cfe4d6ea258d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -298,7 +298,7 @@ ENTRY(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset #endif #ifdef CONFIG_RETPOLINE @@ -430,7 +430,7 @@ END(irq_entries_start) * it before we actually move ourselves to the IRQ stack. */ - movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8) movq PER_CPU_VAR(hardirq_stack_ptr), %rsp #ifdef CONFIG_DEBUG_ENTRY diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5e3dd4e2136d..7e99ef67bff0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -367,6 +367,13 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); #define __KERNEL_TSS_LIMIT \ (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) +/* Per CPU interrupt stacks */ +struct irq_stack { + char stack[IRQ_STACK_SIZE]; +} __aligned(IRQ_STACK_SIZE); + +DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); + #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else @@ -375,28 +382,24 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #endif #ifdef CONFIG_X86_64 -union irq_stack_union { - char irq_stack[IRQ_STACK_SIZE]; +struct fixed_percpu_data { /* * GCC hardcodes the stack canary as %gs:40. Since the * irq_stack is the object at %gs:0, we reserve the bottom * 48 bytes of the irq stack for the canary. */ - struct { - char gs_base[40]; - unsigned long stack_canary; - }; + char gs_base[40]; + unsigned long stack_canary; }; -DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; -DECLARE_INIT_PER_CPU(irq_stack_union); +DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; +DECLARE_INIT_PER_CPU(fixed_percpu_data); static inline unsigned long cpu_kernelmode_gs_base(int cpu) { - return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); + return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(char *, hardirq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); @@ -418,14 +421,7 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* - * per-CPU IRQ handling stacks - */ -struct irq_stack { - char stack[IRQ_STACK_SIZE]; -} __aligned(IRQ_STACK_SIZE); - -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +/* Per CPU softirq stack pointer */ DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* X86_64 */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 8ec97a62c245..91e29b6a86a5 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -13,7 +13,7 @@ * On x86_64, %gs is shared by percpu area and stack canary. All * percpu symbols are zero based and %gs points to the base of percpu * area. The first occupant of the percpu area is always - * irq_stack_union which contains stack_canary at offset 40. Userland + * fixed_percpu_data which contains stack_canary at offset 40. Userland * %gs is always saved and restored on kernel entry and exit using * swapgs, so stack protector doesn't add any complexity there. * @@ -64,7 +64,7 @@ static __always_inline void boot_init_stack_canary(void) u64 tsc; #ifdef CONFIG_X86_64 - BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40); + BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); #endif /* * We both use the random pool and the current TSC as a source @@ -79,7 +79,7 @@ static __always_inline void boot_init_stack_canary(void) current->stack_canary = canary; #ifdef CONFIG_X86_64 - this_cpu_write(irq_stack_union.stack_canary, canary); + this_cpu_write(fixed_percpu_data.stack_canary, canary); #else this_cpu_write(stack_canary.canary, canary); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index f5281567e28e..d3d075226c0a 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -73,7 +73,7 @@ int main(void) BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); BLANK(); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1222080838da..801c6f040faa 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1498,9 +1498,9 @@ static __init int setup_clearcpuid(char *arg) __setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); +DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, + fixed_percpu_data) __aligned(PAGE_SIZE) __visible; +EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); /* * The following percpu variables are hot. Align current_task to @@ -1510,7 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, hardirq_stack_ptr); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d1dbe8e4eb82..bcd206c8ac90 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,7 +265,7 @@ ENDPROC(start_cpu0) GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) - .quad INIT_PER_CPU_VAR(irq_stack_union) + .quad INIT_PER_CPU_VAR(fixed_percpu_data) GLOBAL(initial_stack) /* * The SIZEOF_PTREGS gap is a convention which helps the in-kernel diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index c0bea0d7d76a..c0f89d136b80 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -23,6 +23,9 @@ #include #include +DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; +DECLARE_INIT_PER_CPU(irq_stack_backing_store); + int sysctl_panic_on_stackoverflow; /* @@ -90,7 +93,7 @@ bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) static int map_irq_stack(unsigned int cpu) { - void *va = per_cpu_ptr(irq_stack_union.irq_stack, cpu); + void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; return 0; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 657343ecc2da..86663874ef04 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -244,11 +244,6 @@ void __init setup_per_cpu_areas(void) per_cpu(x86_cpu_to_logical_apicid, cpu) = early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); #endif -#ifdef CONFIG_X86_64 - per_cpu(hardirq_stack_ptr, cpu) = - per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE; -#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bad8c51fee6e..a5af9a7c4be4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -403,7 +403,8 @@ SECTIONS */ #define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); +INIT_PER_CPU(fixed_percpu_data); +INIT_PER_CPU(irq_stack_backing_store); /* * Build-time check on the image size: @@ -412,8 +413,8 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((fixed_percpu_data == 0), + "fixed_percpu_data is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index b629f6992d9f..efa483205e43 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -738,7 +738,7 @@ static void percpu_init(void) * __per_cpu_load * * The "gold" linker incorrectly associates: - * init_per_cpu__irq_stack_union + * init_per_cpu__fixed_percpu_data * init_per_cpu__gdt_page */ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 5077ead5e59c..c1d8b90aa4e2 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -40,13 +40,13 @@ ENTRY(startup_xen) #ifdef CONFIG_X86_64 /* Set up %gs. * - * The base of %gs always points to the bottom of the irqstack - * union. If the stack protector canary is enabled, it is - * located at %gs:40. Note that, on SMP, the boot cpu uses - * init data section till per cpu areas are set up. + * The base of %gs always points to fixed_percpu_data. If the + * stack protector canary is enabled, it is located at %gs:40. + * Note that, on SMP, the boot cpu uses init data section until + * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq $INIT_PER_CPU_VAR(irq_stack_union),%rax + movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax cdq wrmsr #endif -- cgit v1.2.3-59-g8ed1b