aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-12-01 19:05:07 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-12-01 19:05:07 -0800
commite5b3fc125d768eacd73bb4dc5019f0ce95635af4 (patch)
tree4f7e06f8a0493865a6b604bbcef5118c4582ebcf /arch/x86/kernel
parentMerge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (diff)
parentx86/mm/pat: Fix off-by-one bugs in interval tree search (diff)
downloadlinux-dev-e5b3fc125d768eacd73bb4dc5019f0ce95635af4.tar.xz
linux-dev-e5b3fc125d768eacd73bb4dc5019f0ce95635af4.zip
Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Ingo Molnar: "Various fixes: - Fix the PAT performance regression that downgraded write-combining device memory regions to uncached. - There's been a number of bugs in 32-bit double fault handling - hopefully all fixed now. - Fix an LDT crash - Fix an FPU over-optimization that broke with GCC9 code optimizations. - Misc cleanups" * 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mm/pat: Fix off-by-one bugs in interval tree search x86/ioperm: Save an indentation level in tss_update_io_bitmap() x86/fpu: Don't cache access to fpu_fpregs_owner_ctx x86/entry/32: Remove unused 'restore_all_notrace' local label x86/ptrace: Document FSBASE and GSBASE ABI oddities x86/ptrace: Remove set_segment_reg() implementations for current x86/traps: die() instead of panicking on a double fault x86/doublefault/32: Rewrite the x86_32 #DF handler and unify with 64-bit x86/doublefault/32: Move #DF stack and TSS to cpu_entry_area x86/doublefault/32: Rename doublefault.c to doublefault_32.c x86/traps: Disentangle the 32-bit and 64-bit doublefault code lkdtm: Add a DOUBLE_FAULT crash type on x86 selftests/x86/single_step_syscall: Check SYSENTER directly x86/mm/32: Sync only to VMALLOC_END in vmalloc_sync_all()
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/cpu/common.c12
-rw-r--r--arch/x86/kernel/doublefault.c86
-rw-r--r--arch/x86/kernel/doublefault_32.c136
-rw-r--r--arch/x86/kernel/dumpstack_32.c30
-rw-r--r--arch/x86/kernel/process.c52
-rw-r--r--arch/x86/kernel/ptrace.c36
-rw-r--r--arch/x86/kernel/traps.c31
8 files changed, 241 insertions, 146 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 32acb970f416..6175e370ee4a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -100,7 +100,9 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
+ifeq ($(CONFIG_X86_32),y)
+obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
+endif
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index baa2fed8deb6..2e4d90294fe6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -24,6 +24,7 @@
#include <asm/stackprotector.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
+#include <asm/doublefault.h>
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -1814,8 +1815,6 @@ static inline void tss_setup_ist(struct tss_struct *tss)
tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
}
-static inline void gdt_setup_doublefault_tss(int cpu) { }
-
#else /* CONFIG_X86_64 */
static inline void setup_getcpu(int cpu) { }
@@ -1827,13 +1826,6 @@ static inline void ucode_cpu_init(int cpu)
static inline void tss_setup_ist(struct tss_struct *tss) { }
-static inline void gdt_setup_doublefault_tss(int cpu)
-{
-#ifdef CONFIG_DOUBLEFAULT
- /* Set up the doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
-}
#endif /* !CONFIG_X86_64 */
static inline void tss_setup_io_bitmap(struct tss_struct *tss)
@@ -1923,7 +1915,7 @@ void cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();
- gdt_setup_doublefault_tss(cpu);
+ doublefault_init_cpu_tss();
fpu__init_cpu();
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
deleted file mode 100644
index 0d6c657593f8..000000000000
--- a/arch/x86/kernel/doublefault.c
+++ /dev/null
@@ -1,86 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-#include <linux/init_task.h>
-#include <linux/fs.h>
-
-#include <linux/uaccess.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-
-#ifdef CONFIG_X86_32
-
-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
-
-#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
-
-static void doublefault_fn(void)
-{
- struct desc_ptr gdt_desc = {0, 0};
- unsigned long gdt, tss;
-
- native_store_gdt(&gdt_desc);
- gdt = gdt_desc.address;
-
- printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
-
- if (ptr_ok(gdt)) {
- gdt += GDT_ENTRY_TSS << 3;
- tss = get_desc_base((struct desc_struct *)gdt);
- printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
-
- if (ptr_ok(tss)) {
- struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
-
- printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
- t->ip, t->sp);
-
- printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
- t->ax, t->bx, t->cx, t->dx);
- printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
- t->si, t->di);
- }
- }
-
- for (;;)
- cpu_relax();
-}
-
-struct x86_hw_tss doublefault_tss __cacheline_aligned = {
- .sp0 = STACK_START,
- .ss0 = __KERNEL_DS,
- .ldt = 0,
- .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
-
- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
- .es = __USER_DS,
- .cs = __KERNEL_CS,
- .ss = __KERNEL_DS,
- .ds = __USER_DS,
- .fs = __KERNEL_PERCPU,
-#ifndef CONFIG_X86_32_LAZY_GS
- .gs = __KERNEL_STACK_CANARY,
-#endif
-
- .__cr3 = __pa_nodebug(swapper_pg_dir),
-};
-
-/* dummy for do_double_fault() call */
-void df_debug(struct pt_regs *regs, long error_code) {}
-
-#else /* !CONFIG_X86_32 */
-
-void df_debug(struct pt_regs *regs, long error_code)
-{
- pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
- show_regs(regs);
- panic("Machine halted.");
-}
-#endif
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
new file mode 100644
index 000000000000..3793646f0fb5
--- /dev/null
+++ b/arch/x86/kernel/doublefault_32.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/traps.h>
+
+extern void double_fault(void);
+#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+
+#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
+
+static void set_df_gdt_entry(unsigned int cpu);
+
+/*
+ * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
+ * we're running the doublefault task. Cannot return.
+ */
+asmlinkage notrace void __noreturn doublefault_shim(void)
+{
+ unsigned long cr2;
+ struct pt_regs regs;
+
+ BUILD_BUG_ON(sizeof(struct doublefault_stack) != PAGE_SIZE);
+
+ cr2 = native_read_cr2();
+
+ /* Reset back to the normal kernel task. */
+ force_reload_TR();
+ set_df_gdt_entry(smp_processor_id());
+
+ trace_hardirqs_off();
+
+ /*
+ * Fill in pt_regs. A downside of doing this in C is that the unwinder
+ * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
+ * won't successfully unwind to the source of the double fault.
+ * The main dump from do_double_fault() is fine, though, since it
+ * uses these regs directly.
+ *
+ * If anyone ever cares, this could be moved to asm.
+ */
+ regs.ss = TSS(ss);
+ regs.__ssh = 0;
+ regs.sp = TSS(sp);
+ regs.flags = TSS(flags);
+ regs.cs = TSS(cs);
+ /* We won't go through the entry asm, so we can leave __csh as 0. */
+ regs.__csh = 0;
+ regs.ip = TSS(ip);
+ regs.orig_ax = 0;
+ regs.gs = TSS(gs);
+ regs.__gsh = 0;
+ regs.fs = TSS(fs);
+ regs.__fsh = 0;
+ regs.es = TSS(es);
+ regs.__esh = 0;
+ regs.ds = TSS(ds);
+ regs.__dsh = 0;
+ regs.ax = TSS(ax);
+ regs.bp = TSS(bp);
+ regs.di = TSS(di);
+ regs.si = TSS(si);
+ regs.dx = TSS(dx);
+ regs.cx = TSS(cx);
+ regs.bx = TSS(bx);
+
+ do_double_fault(&regs, 0, cr2);
+
+ /*
+ * x86_32 does not save the original CR3 anywhere on a task switch.
+ * This means that, even if we wanted to return, we would need to find
+ * some way to reconstruct CR3. We could make a credible guess based
+ * on cpu_tlbstate, but that would be racy and would not account for
+ * PTI.
+ *
+ * Instead, don't bother. We can return through
+ * rewind_stack_do_exit() instead.
+ */
+ panic("cannot return from double fault\n");
+}
+NOKPROBE_SYMBOL(doublefault_shim);
+
+DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
+ .tss = {
+ /*
+ * No sp0 or ss0 -- we never run CPL != 0 with this TSS
+ * active. sp is filled in later.
+ */
+ .ldt = 0,
+ .io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
+
+ .ip = (unsigned long) double_fault,
+ .flags = X86_EFLAGS_FIXED,
+ .es = __USER_DS,
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ds = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+#ifndef CONFIG_X86_32_LAZY_GS
+ .gs = __KERNEL_STACK_CANARY,
+#endif
+
+ .__cr3 = __pa_nodebug(swapper_pg_dir),
+ },
+};
+
+static void set_df_gdt_entry(unsigned int cpu)
+{
+ /* Set up doublefault TSS pointer in the GDT */
+ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS,
+ &get_cpu_entry_area(cpu)->doublefault_stack.tss);
+
+}
+
+void doublefault_init_cpu_tss(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
+
+ /*
+ * The linker isn't smart enough to initialize percpu variables that
+ * point to other places in percpu space.
+ */
+ this_cpu_write(doublefault_stack.tss.sp,
+ (unsigned long)&cea->doublefault_stack.stack +
+ sizeof(doublefault_stack.stack));
+
+ set_df_gdt_entry(cpu);
+}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 64a59d726639..8e3a8fedfa4d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -29,6 +29,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_ENTRY)
return "ENTRY_TRAMPOLINE";
+ if (type == STACK_TYPE_EXCEPTION)
+ return "#DF";
+
return NULL;
}
@@ -82,6 +85,30 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
return true;
}
+static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
+{
+#ifdef CONFIG_DOUBLEFAULT
+ struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id());
+ struct doublefault_stack *ss = &cea->doublefault_stack;
+
+ void *begin = ss->stack;
+ void *end = begin + sizeof(ss->stack);
+
+ if ((void *)stack < begin || (void *)stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_EXCEPTION;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp);
+
+ return true;
+#else
+ return false;
+#endif
+}
+
+
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask)
{
@@ -105,6 +132,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_softirq_stack(stack, info))
goto recursion_check;
+ if (in_doublefault_stack(stack, info))
+ goto recursion_check;
+
goto unknown;
recursion_check:
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index bd2a11ca5dd6..61e93a318983 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -377,37 +377,37 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
void tss_update_io_bitmap(void)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ struct thread_struct *t = &current->thread;
u16 *base = &tss->x86_tss.io_bitmap_base;
- if (test_thread_flag(TIF_IO_BITMAP)) {
- struct thread_struct *t = &current->thread;
-
- if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
- *base = IO_BITMAP_OFFSET_VALID_ALL;
- } else {
- struct io_bitmap *iobm = t->io_bitmap;
- /*
- * Only copy bitmap data when the sequence number
- * differs. The update time is accounted to the
- * incoming task.
- */
- if (tss->io_bitmap.prev_sequence != iobm->sequence)
- tss_copy_io_bitmap(tss, iobm);
-
- /* Enable the bitmap */
- *base = IO_BITMAP_OFFSET_VALID_MAP;
- }
+ if (!test_thread_flag(TIF_IO_BITMAP)) {
+ tss_invalidate_io_bitmap(tss);
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
+ *base = IO_BITMAP_OFFSET_VALID_ALL;
+ } else {
+ struct io_bitmap *iobm = t->io_bitmap;
+
/*
- * Make sure that the TSS limit is covering the io bitmap.
- * It might have been cut down by a VMEXIT to 0x67 which
- * would cause a subsequent I/O access from user space to
- * trigger a #GP because tbe bitmap is outside the TSS
- * limit.
+ * Only copy bitmap data when the sequence number differs. The
+ * update time is accounted to the incoming task.
*/
- refresh_tss_limit();
- } else {
- tss_invalidate_io_bitmap(tss);
+ if (tss->io_bitmap.prev_sequence != iobm->sequence)
+ tss_copy_io_bitmap(tss, iobm);
+
+ /* Enable the bitmap */
+ *base = IO_BITMAP_OFFSET_VALID_MAP;
}
+
+ /*
+ * Make sure that the TSS limit is covering the IO bitmap. It might have
+ * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
+ * access from user space to trigger a #GP because tbe bitmap is outside
+ * the TSS limit.
+ */
+ refresh_tss_limit();
}
#else /* CONFIG_X86_IOPL_IOPERM */
static inline void switch_to_bitmap(unsigned long tifp) { }
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 066e5b01a7e0..f0e1ddbc2fd7 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -182,6 +182,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task,
unsigned long offset, u16 value)
{
+ if (WARN_ON_ONCE(task == current))
+ return -EIO;
+
/*
* The value argument was already truncated to 16 bits.
*/
@@ -209,10 +212,7 @@ static int set_segment_reg(struct task_struct *task,
break;
case offsetof(struct user_regs_struct, gs):
- if (task == current)
- set_user_gs(task_pt_regs(task), value);
- else
- task_user_gs(task) = value;
+ task_user_gs(task) = value;
}
return 0;
@@ -272,32 +272,41 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task,
unsigned long offset, u16 value)
{
+ if (WARN_ON_ONCE(task == current))
+ return -EIO;
+
/*
* The value argument was already truncated to 16 bits.
*/
if (invalid_selector(value))
return -EIO;
+ /*
+ * This function has some ABI oddities.
+ *
+ * A 32-bit ptracer probably expects that writing FS or GS will change
+ * FSBASE or GSBASE respectively. In the absence of FSGSBASE support,
+ * this code indeed has that effect. When FSGSBASE is added, this
+ * will require a special case.
+ *
+ * For existing 64-bit ptracers, writing FS or GS *also* currently
+ * changes the base if the selector is nonzero the next time the task
+ * is run. This behavior may not be needed, and trying to preserve it
+ * when FSGSBASE is added would be complicated at best.
+ */
+
switch (offset) {
case offsetof(struct user_regs_struct,fs):
task->thread.fsindex = value;
- if (task == current)
- loadsegment(fs, task->thread.fsindex);
break;
case offsetof(struct user_regs_struct,gs):
task->thread.gsindex = value;
- if (task == current)
- load_gs_index(task->thread.gsindex);
break;
case offsetof(struct user_regs_struct,ds):
task->thread.ds = value;
- if (task == current)
- loadsegment(ds, task->thread.ds);
break;
case offsetof(struct user_regs_struct,es):
task->thread.es = value;
- if (task == current)
- loadsegment(es, task->thread.es);
break;
/*
@@ -375,6 +384,9 @@ static int putreg(struct task_struct *child,
* When changing the FS base, use do_arch_prctl_64()
* to set the index to zero and to set the base
* as requested.
+ *
+ * NB: This behavior is nonsensical and likely needs to
+ * change when FSGSBASE support is added.
*/
if (child->thread.fsbase != value)
return do_arch_prctl_64(child, ARCH_SET_FS, value);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c90312146da0..05da6b5b167b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,8 +306,23 @@ __visible void __noreturn handle_stack_overflow(const char *message,
}
#endif
-#ifdef CONFIG_X86_64
-/* Runs on IST stack */
+#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
+/*
+ * Runs on an IST stack for x86_64 and on a special task stack for x86_32.
+ *
+ * On x86_64, this is more or less a normal kernel entry. Notwithstanding the
+ * SDM's warnings about double faults being unrecoverable, returning works as
+ * expected. Presumably what the SDM actually means is that the CPU may get
+ * the register state wrong on entry, so returning could be a bad idea.
+ *
+ * Various CPU engineers have promised that double faults due to an IRET fault
+ * while the stack is read-only are, in fact, recoverable.
+ *
+ * On x86_32, this is entered through a task gate, and regs are synthesized
+ * from the TSS. Returning is, in principle, okay, but changes to regs will
+ * be lost. If, for some reason, we need to return to a context with modified
+ * regs, the shim code could be adjusted to synchronize the registers.
+ */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
{
static const char str[] = "double fault";
@@ -411,15 +426,9 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
#endif
-#ifdef CONFIG_DOUBLEFAULT
- df_debug(regs, error_code);
-#endif
- /*
- * This is always a kernel trap and never fixable (and thus must
- * never return).
- */
- for (;;)
- die(str, regs, error_code);
+ pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
+ die("double fault", regs, error_code);
+ panic("Machine halted.");
}
#endif