aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig12
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/boot/header.S6
-rw-r--r--arch/x86/events/core.c20
-rw-r--r--arch/x86/events/intel/core.c68
-rw-r--r--arch/x86/events/perf_event.h13
-rw-r--r--arch/x86/include/asm/fpu/internal.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/nospec-branch.h26
-rw-r--r--arch/x86/include/asm/spec-ctrl.h20
-rw-r--r--arch/x86/include/asm/switch_to.h3
-rw-r--r--arch/x86/include/asm/thread_info.h20
-rw-r--r--arch/x86/include/asm/tlbflush.h8
-rw-r--r--arch/x86/include/asm/x86_init.h2
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h7
-rw-r--r--arch/x86/kernel/acpi/boot.c2
-rw-r--r--arch/x86/kernel/cpu/bugs.c525
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c19
-rw-r--r--arch/x86/kernel/fpu/signal.c4
-rw-r--r--arch/x86/kernel/ftrace.c15
-rw-r--r--arch/x86/kernel/head32.c1
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/process.c101
-rw-r--r--arch/x86/kernel/process.h39
-rw-r--r--arch/x86/kernel/process_32.c10
-rw-r--r--arch/x86/kernel/process_64.c10
-rw-r--r--arch/x86/kernel/setup.c17
-rw-r--r--arch/x86/kvm/lapic.c7
-rw-r--r--arch/x86/kvm/mmu.c27
-rw-r--r--arch/x86/kvm/svm.c44
-rw-r--r--arch/x86/kvm/vmx.c98
-rw-r--r--arch/x86/kvm/x86.c10
-rw-r--r--arch/x86/mm/tlb.c115
-rw-r--r--arch/x86/xen/enlighten.c78
-rw-r--r--arch/x86/xen/multicalls.c35
-rw-r--r--arch/x86/xen/setup.c6
-rw-r--r--arch/x86/xen/spinlock.c7
38 files changed, 882 insertions, 510 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9d734f3c8234..8689e794a43c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -444,10 +444,6 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
- Without compiler support, at least indirect branches in assembler
- code are eliminated. Since this includes the syscall entry path,
- it is not entirely pointless.
-
config INTEL_RDT
bool "Intel Resource Director Technology support"
depends on X86 && CPU_SUP_INTEL
@@ -1004,13 +1000,7 @@ config NR_CPUS
to the kernel image.
config SCHED_SMT
- bool "SMT (Hyperthreading) scheduler support"
- depends on SMP
- ---help---
- SMT scheduler support improves the CPU scheduler's decision making
- when dealing with Intel Pentium 4 chips with HyperThreading at a
- cost of slightly increased overhead in some places. If unsure say
- N here.
+ def_bool y if SMP
config SCHED_MC
def_bool y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 88398fdf8129..f5d7f4134524 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -220,9 +220,10 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# Avoid indirect branches in kernel to deal with Spectre
ifdef CONFIG_RETPOLINE
-ifneq ($(RETPOLINE_CFLAGS),)
- KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
+ifeq ($(RETPOLINE_CFLAGS),)
+ $(error You are building kernel with non-retpoline compiler, please update your compiler.)
endif
+ KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
endif
archscripts: scripts_basic
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 4c881c850125..850b8762e889 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -300,7 +300,7 @@ _start:
# Part 2 of the header, from the old setup.S
.ascii "HdrS" # header signature
- .word 0x020e # header version number (>= 0x0105)
+ .word 0x020d # header version number (>= 0x0105)
# or else old loadlin-1.5 will fail)
.globl realmode_swtch
realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -558,10 +558,6 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
init_size: .long INIT_SIZE # kernel initialization size
handover_offset: .long 0 # Filled in by build.c
-acpi_rsdp_addr: .quad 0 # 64-bit physical pointer to the
- # ACPI RSDP table, added with
- # version 2.14
-
# End of setup header #####################################################
.section ".entrytext", "ax"
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 106911b603bd..374a19712e20 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -438,26 +438,6 @@ int x86_setup_perfctr(struct perf_event *event)
if (config == -1LL)
return -EINVAL;
- /*
- * Branch tracing:
- */
- if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
- !attr->freq && hwc->sample_period == 1) {
- /* BTS is not supported by this architecture. */
- if (!x86_pmu.bts_active)
- return -EOPNOTSUPP;
-
- /* BTS is currently only allowed for user-mode. */
- if (!attr->exclude_kernel)
- return -EOPNOTSUPP;
-
- /* disallow bts if conflicting events are present */
- if (x86_add_exclusive(x86_lbr_exclusive_lbr))
- return -EBUSY;
-
- event->destroy = hw_perf_lbr_event_destroy;
- }
-
hwc->config |= config;
return 0;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 273c62e81546..ecc3e34ca955 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2306,14 +2306,18 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
return handled;
}
-static bool disable_counter_freezing;
+static bool disable_counter_freezing = true;
static int __init intel_perf_counter_freezing_setup(char *s)
{
- disable_counter_freezing = true;
- pr_info("Intel PMU Counter freezing feature disabled\n");
+ bool res;
+
+ if (kstrtobool(s, &res))
+ return -EINVAL;
+
+ disable_counter_freezing = !res;
return 1;
}
-__setup("disable_counter_freezing", intel_perf_counter_freezing_setup);
+__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup);
/*
* Simplified handler for Arch Perfmon v4:
@@ -2470,16 +2474,7 @@ done:
static struct event_constraint *
intel_bts_constraints(struct perf_event *event)
{
- struct hw_perf_event *hwc = &event->hw;
- unsigned int hw_event, bts_event;
-
- if (event->attr.freq)
- return NULL;
-
- hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
- bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
-
- if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
+ if (unlikely(intel_pmu_has_bts(event)))
return &bts_constraint;
return NULL;
@@ -3098,6 +3093,43 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event)
return flags;
}
+static int intel_pmu_bts_config(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+
+ if (unlikely(intel_pmu_has_bts(event))) {
+ /* BTS is not supported by this architecture. */
+ if (!x86_pmu.bts_active)
+ return -EOPNOTSUPP;
+
+ /* BTS is currently only allowed for user-mode. */
+ if (!attr->exclude_kernel)
+ return -EOPNOTSUPP;
+
+ /* BTS is not allowed for precise events. */
+ if (attr->precise_ip)
+ return -EOPNOTSUPP;
+
+ /* disallow bts if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
+ }
+
+ return 0;
+}
+
+static int core_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ return intel_pmu_bts_config(event);
+}
+
static int intel_pmu_hw_config(struct perf_event *event)
{
int ret = x86_pmu_hw_config(event);
@@ -3105,6 +3137,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (ret)
return ret;
+ ret = intel_pmu_bts_config(event);
+ if (ret)
+ return ret;
+
if (event->attr.precise_ip) {
if (!event->attr.freq) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
@@ -3127,7 +3163,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
/*
* BTS is set up earlier in this path, so don't account twice
*/
- if (!intel_pmu_has_bts(event)) {
+ if (!unlikely(intel_pmu_has_bts(event))) {
/* disallow lbr if conflicting events are present */
if (x86_add_exclusive(x86_lbr_exclusive_lbr))
return -EBUSY;
@@ -3596,7 +3632,7 @@ static __initconst const struct x86_pmu core_pmu = {
.enable_all = core_pmu_enable_all,
.enable = core_pmu_enable_event,
.disable = x86_pmu_disable_event,
- .hw_config = x86_pmu_hw_config,
+ .hw_config = core_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index adae087cecdd..78d7b7031bfc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -859,11 +859,16 @@ static inline int amd_pmu_init(void)
static inline bool intel_pmu_has_bts(struct perf_event *event)
{
- if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
- !event->attr.freq && event->hw.sample_period == 1)
- return true;
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned int hw_event, bts_event;
+
+ if (event->attr.freq)
+ return false;
+
+ hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+ bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
- return false;
+ return hw_event == bts_event && hwc->sample_period == 1;
}
int intel_pmu_save_and_restart(struct perf_event *event);
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 5f7290e6e954..69dcdf195b61 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -226,7 +226,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
"3: movl $-2,%[err]\n\t" \
"jmp 2b\n\t" \
".popsection\n\t" \
- _ASM_EXTABLE_UA(1b, 3b) \
+ _ASM_EXTABLE(1b, 3b) \
: [err] "=r" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 55e51ff7e421..fbda5a917c5b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1094,7 +1094,8 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu);
- void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ /* Returns actual tsc_offset set in active VMCS */
+ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 80f4a4f38c79..c8f73efb4ece 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -41,9 +41,10 @@
#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
-#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
+#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */
+#define SPEC_CTRL_STIBP (1 << SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
-#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
+#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 80dc14422495..032b6009baab 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -3,6 +3,8 @@
#ifndef _ASM_X86_NOSPEC_BRANCH_H_
#define _ASM_X86_NOSPEC_BRANCH_H_
+#include <linux/static_key.h>
+
#include <asm/alternative.h>
#include <asm/alternative-asm.h>
#include <asm/cpufeatures.h>
@@ -162,11 +164,12 @@
_ASM_PTR " 999b\n\t" \
".popsection\n\t"
-#if defined(CONFIG_X86_64) && defined(RETPOLINE)
+#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_X86_64
/*
- * Since the inline asm uses the %V modifier which is only in newer GCC,
- * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE.
+ * Inline asm uses the %V modifier which is only in newer GCC
+ * which is ensured when CONFIG_RETPOLINE is defined.
*/
# define CALL_NOSPEC \
ANNOTATE_NOSPEC_ALTERNATIVE \
@@ -181,7 +184,7 @@
X86_FEATURE_RETPOLINE_AMD)
# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
-#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE)
+#else /* CONFIG_X86_32 */
/*
* For i386 we use the original ret-equivalent retpoline, because
* otherwise we'll run out of registers. We don't care about CET
@@ -211,6 +214,7 @@
X86_FEATURE_RETPOLINE_AMD)
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
+#endif
#else /* No retpoline for C / inline asm */
# define CALL_NOSPEC "call *%[thunk_target]\n"
# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
@@ -219,13 +223,19 @@
/* The Spectre V2 mitigation variants */
enum spectre_v2_mitigation {
SPECTRE_V2_NONE,
- SPECTRE_V2_RETPOLINE_MINIMAL,
- SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
SPECTRE_V2_RETPOLINE_GENERIC,
SPECTRE_V2_RETPOLINE_AMD,
SPECTRE_V2_IBRS_ENHANCED,
};
+/* The indirect branch speculation control variants */
+enum spectre_v2_user_mitigation {
+ SPECTRE_V2_USER_NONE,
+ SPECTRE_V2_USER_STRICT,
+ SPECTRE_V2_USER_PRCTL,
+ SPECTRE_V2_USER_SECCOMP,
+};
+
/* The Speculative Store Bypass disable variants */
enum ssb_mitigation {
SPEC_STORE_BYPASS_NONE,
@@ -303,6 +313,10 @@ do { \
preempt_enable(); \
} while (0)
+DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
#endif /* __ASSEMBLY__ */
/*
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
index ae7c2c5cd7f0..5393babc0598 100644
--- a/arch/x86/include/asm/spec-ctrl.h
+++ b/arch/x86/include/asm/spec-ctrl.h
@@ -53,12 +53,24 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
}
+static inline u64 stibp_tif_to_spec_ctrl(u64 tifn)
+{
+ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
+ return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
+}
+
static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
{
BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
}
+static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl)
+{
+ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
+ return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
+}
+
static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
{
return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
@@ -70,11 +82,7 @@ extern void speculative_store_bypass_ht_init(void);
static inline void speculative_store_bypass_ht_init(void) { }
#endif
-extern void speculative_store_bypass_update(unsigned long tif);
-
-static inline void speculative_store_bypass_update_current(void)
-{
- speculative_store_bypass_update(current_thread_info()->flags);
-}
+extern void speculation_ctrl_update(unsigned long tif);
+extern void speculation_ctrl_update_current(void);
#endif
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 36bd243843d6..7cf1a270d891 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -11,9 +11,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
-struct tss_struct;
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
- struct tss_struct *tss);
/* This runs runs on the previous thread's stack. */
static inline void prepare_switch_to(struct task_struct *next)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 2ff2a30a264f..82b73b75d67c 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -79,10 +79,12 @@ struct thread_info {
#define TIF_SIGPENDING 2 /* signal pending */
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
-#define TIF_SSBD 5 /* Reduced data speculation */
+#define TIF_SSBD 5 /* Speculative store bypass disable */
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
+#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
+#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_PATCH_PENDING 13 /* pending live patching update */
@@ -110,6 +112,8 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
+#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
+#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING)
@@ -145,8 +149,18 @@ struct thread_info {
_TIF_FSCHECK)
/* flags to check in __switch_to() */
-#define _TIF_WORK_CTXSW \
- (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD)
+#define _TIF_WORK_CTXSW_BASE \
+ (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP| \
+ _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
+
+/*
+ * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
+ */
+#ifdef CONFIG_SMP
+# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
+#else
+# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE)
+#endif
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d760611cfc35..f4204bf377fc 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -169,10 +169,14 @@ struct tlb_state {
#define LOADED_MM_SWITCHING ((struct mm_struct *)1)
+ /* Last user mm for optimizing IBPB */
+ union {
+ struct mm_struct *last_user_mm;
+ unsigned long last_user_mm_ibpb;
+ };
+
u16 loaded_mm_asid;
u16 next_asid;
- /* last user mm's ctx id */
- u64 last_ctx_id;
/*
* We can be in one of several states:
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 0f842104862c..b85a7c54c6a1 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -303,6 +303,4 @@ extern void x86_init_noop(void);
extern void x86_init_uint_noop(unsigned int unused);
extern bool x86_pnpbios_disabled(void);
-void x86_verify_bootdata_version(void);
-
#endif
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 22f89d040ddd..60733f137e9a 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -16,9 +16,6 @@
#define RAMDISK_PROMPT_FLAG 0x8000
#define RAMDISK_LOAD_FLAG 0x4000
-/* version flags */
-#define VERSION_WRITTEN 0x8000
-
/* loadflags */
#define LOADED_HIGH (1<<0)
#define KASLR_FLAG (1<<1)
@@ -89,7 +86,6 @@ struct setup_header {
__u64 pref_address;
__u32 init_size;
__u32 handover_offset;
- __u64 acpi_rsdp_addr;
} __attribute__((packed));
struct sys_desc_table {
@@ -159,7 +155,8 @@ struct boot_params {
__u8 _pad2[4]; /* 0x054 */
__u64 tboot_addr; /* 0x058 */
struct ist_info ist_info; /* 0x060 */
- __u8 _pad3[16]; /* 0x070 */
+ __u64 acpi_rsdp_addr; /* 0x070 */
+ __u8 _pad3[8]; /* 0x078 */
__u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
__u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
struct sys_desc_table sys_desc_table; /* obsolete! */ /* 0x0a0 */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 92c76bf97ad8..06635fbca81c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1776,5 +1776,5 @@ void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
u64 x86_default_get_root_pointer(void)
{
- return boot_params.hdr.acpi_rsdp_addr;
+ return boot_params.acpi_rsdp_addr;
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c37e66e493bf..500278f5308e 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
+#include <linux/sched/smt.h>
#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
@@ -53,6 +54,13 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
u64 __ro_after_init x86_amd_ls_cfg_base;
u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
+/* Control conditional STIPB in switch_to() */
+DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
void __init check_bugs(void)
{
identify_boot_cpu();
@@ -123,31 +131,6 @@ void __init check_bugs(void)
#endif
}
-/* The kernel command line selection */
-enum spectre_v2_mitigation_cmd {
- SPECTRE_V2_CMD_NONE,
- SPECTRE_V2_CMD_AUTO,
- SPECTRE_V2_CMD_FORCE,
- SPECTRE_V2_CMD_RETPOLINE,
- SPECTRE_V2_CMD_RETPOLINE_GENERIC,
- SPECTRE_V2_CMD_RETPOLINE_AMD,
-};
-
-static const char *spectre_v2_strings[] = {
- [SPECTRE_V2_NONE] = "Vulnerable",
- [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
- [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
- [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
- [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
- [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
-};
-
-#undef pr_fmt
-#define pr_fmt(fmt) "Spectre V2 : " fmt
-
-static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
- SPECTRE_V2_NONE;
-
void
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
{
@@ -169,6 +152,10 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
static_cpu_has(X86_FEATURE_AMD_SSBD))
hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
+ /* Conditional STIBP enabled? */
+ if (static_branch_unlikely(&switch_to_cond_stibp))
+ hostval |= stibp_tif_to_spec_ctrl(ti->flags);
+
if (hostval != guestval) {
msrval = setguest ? guestval : hostval;
wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
@@ -202,7 +189,7 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
ssbd_spec_ctrl_to_tif(hostval);
- speculative_store_bypass_update(tif);
+ speculation_ctrl_update(tif);
}
}
EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
@@ -217,6 +204,15 @@ static void x86_amd_ssb_disable(void)
wrmsrl(MSR_AMD64_LS_CFG, msrval);
}
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V2 : " fmt
+
+static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
+ SPECTRE_V2_NONE;
+
+static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+
#ifdef RETPOLINE
static bool spectre_v2_bad_module;
@@ -238,67 +234,217 @@ static inline const char *spectre_v2_module_string(void)
static inline const char *spectre_v2_module_string(void) { return ""; }
#endif
-static void __init spec2_print_if_insecure(const char *reason)
+static inline bool match_option(const char *arg, int arglen, const char *opt)
{
- if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
- pr_info("%s selected on command line.\n", reason);
+ int len = strlen(opt);
+
+ return len == arglen && !strncmp(arg, opt, len);
}
-static void __init spec2_print_if_secure(const char *reason)
+/* The kernel command line selection for spectre v2 */
+enum spectre_v2_mitigation_cmd {
+ SPECTRE_V2_CMD_NONE,
+ SPECTRE_V2_CMD_AUTO,
+ SPECTRE_V2_CMD_FORCE,
+ SPECTRE_V2_CMD_RETPOLINE,
+ SPECTRE_V2_CMD_RETPOLINE_GENERIC,
+ SPECTRE_V2_CMD_RETPOLINE_AMD,
+};
+
+enum spectre_v2_user_cmd {
+ SPECTRE_V2_USER_CMD_NONE,
+ SPECTRE_V2_USER_CMD_AUTO,
+ SPECTRE_V2_USER_CMD_FORCE,
+ SPECTRE_V2_USER_CMD_PRCTL,
+ SPECTRE_V2_USER_CMD_PRCTL_IBPB,
+ SPECTRE_V2_USER_CMD_SECCOMP,
+ SPECTRE_V2_USER_CMD_SECCOMP_IBPB,
+};
+
+static const char * const spectre_v2_user_strings[] = {
+ [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
+ [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
+ [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
+ [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
+};
+
+static const struct {
+ const char *option;
+ enum spectre_v2_user_cmd cmd;
+ bool secure;
+} v2_user_options[] __initdata = {
+ { "auto", SPECTRE_V2_USER_CMD_AUTO, false },
+ { "off", SPECTRE_V2_USER_CMD_NONE, false },
+ { "on", SPECTRE_V2_USER_CMD_FORCE, true },
+ { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false },
+ { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false },
+ { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false },
+ { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false },
+};
+
+static void __init spec_v2_user_print_cond(const char *reason, bool secure)
{
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
- pr_info("%s selected on command line.\n", reason);
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
+ pr_info("spectre_v2_user=%s forced on command line.\n", reason);
}
-static inline bool retp_compiler(void)
+static enum spectre_v2_user_cmd __init
+spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
{
- return __is_defined(RETPOLINE);
+ char arg[20];
+ int ret, i;
+
+ switch (v2_cmd) {
+ case SPECTRE_V2_CMD_NONE:
+ return SPECTRE_V2_USER_CMD_NONE;
+ case SPECTRE_V2_CMD_FORCE:
+ return SPECTRE_V2_USER_CMD_FORCE;
+ default:
+ break;
+ }
+
+ ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
+ arg, sizeof(arg));
+ if (ret < 0)
+ return SPECTRE_V2_USER_CMD_AUTO;
+
+ for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
+ if (match_option(arg, ret, v2_user_options[i].option)) {
+ spec_v2_user_print_cond(v2_user_options[i].option,
+ v2_user_options[i].secure);
+ return v2_user_options[i].cmd;
+ }
+ }
+
+ pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
+ return SPECTRE_V2_USER_CMD_AUTO;
}
-static inline bool match_option(const char *arg, int arglen, const char *opt)
+static void __init
+spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
{
- int len = strlen(opt);
+ enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
+ bool smt_possible = IS_ENABLED(CONFIG_SMP);
+ enum spectre_v2_user_cmd cmd;
- return len == arglen && !strncmp(arg, opt, len);
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
+ cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
+ smt_possible = false;
+
+ cmd = spectre_v2_parse_user_cmdline(v2_cmd);
+ switch (cmd) {
+ case SPECTRE_V2_USER_CMD_NONE:
+ goto set_mode;
+ case SPECTRE_V2_USER_CMD_FORCE:
+ mode = SPECTRE_V2_USER_STRICT;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ mode = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_AUTO:
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ mode = SPECTRE_V2_USER_SECCOMP;
+ else
+ mode = SPECTRE_V2_USER_PRCTL;
+ break;
+ }
+
+ /* Initialize Indirect Branch Prediction Barrier */
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
+
+ switch (cmd) {
+ case SPECTRE_V2_USER_CMD_FORCE:
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ static_branch_enable(&switch_mm_always_ibpb);
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ case SPECTRE_V2_USER_CMD_AUTO:
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ static_branch_enable(&switch_mm_cond_ibpb);
+ break;
+ default:
+ break;
+ }
+
+ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+ static_key_enabled(&switch_mm_always_ibpb) ?
+ "always-on" : "conditional");
+ }
+
+ /* If enhanced IBRS is enabled no STIPB required */
+ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ return;
+
+ /*
+ * If SMT is not possible or STIBP is not available clear the STIPB
+ * mode.
+ */
+ if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP))
+ mode = SPECTRE_V2_USER_NONE;
+set_mode:
+ spectre_v2_user = mode;
+ /* Only print the STIBP mode when SMT possible */
+ if (smt_possible)
+ pr_info("%s\n", spectre_v2_user_strings[mode]);
}
+static const char * const spectre_v2_strings[] = {
+ [SPECTRE_V2_NONE] = "Vulnerable",
+ [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
+ [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
+ [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
+};
+
static const struct {
const char *option;
enum spectre_v2_mitigation_cmd cmd;
bool secure;
-} mitigation_options[] = {
- { "off", SPECTRE_V2_CMD_NONE, false },
- { "on", SPECTRE_V2_CMD_FORCE, true },
- { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
- { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
- { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
- { "auto", SPECTRE_V2_CMD_AUTO, false },
+} mitigation_options[] __initdata = {
+ { "off", SPECTRE_V2_CMD_NONE, false },
+ { "on", SPECTRE_V2_CMD_FORCE, true },
+ { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
+ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
+ { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
+ { "auto", SPECTRE_V2_CMD_AUTO, false },
};
+static void __init spec_v2_print_cond(const char *reason, bool secure)
+{
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
+ pr_info("%s selected on command line.\n", reason);
+}
+
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
+ enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
char arg[20];
int ret, i;
- enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
return SPECTRE_V2_CMD_NONE;
- else {
- ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
- if (ret < 0)
- return SPECTRE_V2_CMD_AUTO;
- for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
- if (!match_option(arg, ret, mitigation_options[i].option))
- continue;
- cmd = mitigation_options[i].cmd;
- break;
- }
+ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
+ if (ret < 0)
+ return SPECTRE_V2_CMD_AUTO;
- if (i >= ARRAY_SIZE(mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_CMD_AUTO;
- }
+ for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
+ if (!match_option(arg, ret, mitigation_options[i].option))
+ continue;
+ cmd = mitigation_options[i].cmd;
+ break;
+ }
+
+ if (i >= ARRAY_SIZE(mitigation_options)) {
+ pr_err("unknown option (%s). Switching to AUTO select\n", arg);
+ return SPECTRE_V2_CMD_AUTO;
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
@@ -316,54 +462,11 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
- if (mitigation_options[i].secure)
- spec2_print_if_secure(mitigation_options[i].option);
- else
- spec2_print_if_insecure(mitigation_options[i].option);
-
+ spec_v2_print_cond(mitigation_options[i].option,
+ mitigation_options[i].secure);
return cmd;
}
-static bool stibp_needed(void)
-{
- if (spectre_v2_enabled == SPECTRE_V2_NONE)
- return false;
-
- if (!boot_cpu_has(X86_FEATURE_STIBP))
- return false;
-
- return true;
-}
-
-static void update_stibp_msr(void *info)
-{
- wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
-}
-
-void arch_smt_update(void)
-{
- u64 mask;
-
- if (!stibp_needed())
- return;
-
- mutex_lock(&spec_ctrl_mutex);
- mask = x86_spec_ctrl_base;
- if (cpu_smt_control == CPU_SMT_ENABLED)
- mask |= SPEC_CTRL_STIBP;
- else
- mask &= ~SPEC_CTRL_STIBP;
-
- if (mask != x86_spec_ctrl_base) {
- pr_info("Spectre v2 cross-process SMT mitigation: %s STIBP\n",
- cpu_smt_control == CPU_SMT_ENABLED ?
- "Enabling" : "Disabling");
- x86_spec_ctrl_base = mask;
- on_each_cpu(update_stibp_msr, NULL, 1);
- }
- mutex_unlock(&spec_ctrl_mutex);
-}
-
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -417,14 +520,12 @@ retpoline_auto:
pr_err("Spectre mitigation: LFENCE not serializing, switching to generic retpoline\n");
goto retpoline_generic;
}
- mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD :
- SPECTRE_V2_RETPOLINE_MINIMAL_AMD;
+ mode = SPECTRE_V2_RETPOLINE_AMD;
setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
} else {
retpoline_generic:
- mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC :
- SPECTRE_V2_RETPOLINE_MINIMAL;
+ mode = SPECTRE_V2_RETPOLINE_GENERIC;
setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
}
@@ -443,12 +544,6 @@ specv2_set_mode:
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
- /* Initialize Indirect Branch Prediction Barrier if supported */
- if (boot_cpu_has(X86_FEATURE_IBPB)) {
- setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
- pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
- }
-
/*
* Retpoline means the kernel is safe because it has no indirect
* branches. Enhanced IBRS protects firmware too, so, enable restricted
@@ -465,10 +560,67 @@ specv2_set_mode:
pr_info("Enabling Restricted Speculation for firmware calls\n");
}
+ /* Set up IBPB and STIBP depending on the general spectre V2 command */
+ spectre_v2_user_select_mitigation(cmd);
+
/* Enable STIBP if appropriate */
arch_smt_update();
}
+static void update_stibp_msr(void * __unused)
+{
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+}
+
+/* Update x86_spec_ctrl_base in case SMT state changed. */
+static void update_stibp_strict(void)
+{
+ u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;
+
+ if (sched_smt_active())
+ mask |= SPEC_CTRL_STIBP;
+
+ if (mask == x86_spec_ctrl_base)
+ return;
+
+ pr_info("Update user space SMT mitigation: STIBP %s\n",
+ mask & SPEC_CTRL_STIBP ? "always-on" : "off");
+ x86_spec_ctrl_base = mask;
+ on_each_cpu(update_stibp_msr, NULL, 1);
+}
+
+/* Update the static key controlling the evaluation of TIF_SPEC_IB */
+static void update_indir_branch_cond(void)
+{
+ if (sched_smt_active())
+ static_branch_enable(&switch_to_cond_stibp);
+ else
+ static_branch_disable(&switch_to_cond_stibp);
+}
+
+void arch_smt_update(void)
+{
+ /* Enhanced IBRS implies STIBP. No update required. */
+ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ return;
+
+ mutex_lock(&spec_ctrl_mutex);
+
+ switch (spectre_v2_user) {
+ case SPECTRE_V2_USER_NONE:
+ break;
+ case SPECTRE_V2_USER_STRICT:
+ update_stibp_strict();
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ update_indir_branch_cond();
+ break;
+ }
+
+ mutex_unlock(&spec_ctrl_mutex);
+}
+
#undef pr_fmt
#define pr_fmt(fmt) "Speculative Store Bypass: " fmt
@@ -483,7 +635,7 @@ enum ssb_mitigation_cmd {
SPEC_STORE_BYPASS_CMD_SECCOMP,
};
-static const char *ssb_strings[] = {
+static const char * const ssb_strings[] = {
[SPEC_STORE_BYPASS_NONE] = "Vulnerable",
[SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
[SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
@@ -493,7 +645,7 @@ static const char *ssb_strings[] = {
static const struct {
const char *option;
enum ssb_mitigation_cmd cmd;
-} ssb_mitigation_options[] = {
+} ssb_mitigation_options[] __initdata = {
{ "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
{ "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */
{ "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
@@ -604,10 +756,25 @@ static void ssb_select_mitigation(void)
#undef pr_fmt
#define pr_fmt(fmt) "Speculation prctl: " fmt
-static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+static void task_update_spec_tif(struct task_struct *tsk)
{
- bool update;
+ /* Force the update of the real TIF bits */
+ set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE);
+ /*
+ * Immediately update the speculation control MSRs for the current
+ * task, but for a non-current task delay setting the CPU
+ * mitigation until it is scheduled next.
+ *
+ * This can only happen for SECCOMP mitigation. For PRCTL it's
+ * always the current task.
+ */
+ if (tsk == current)
+ speculation_ctrl_update_current();
+}
+
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
return -ENXIO;
@@ -618,28 +785,56 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
if (task_spec_ssb_force_disable(task))
return -EPERM;
task_clear_spec_ssb_disable(task);
- update = test_and_clear_tsk_thread_flag(task, TIF_SSBD);
+ task_update_spec_tif(task);
break;
case PR_SPEC_DISABLE:
task_set_spec_ssb_disable(task);
- update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
+ task_update_spec_tif(task);
break;
case PR_SPEC_FORCE_DISABLE:
task_set_spec_ssb_disable(task);
task_set_spec_ssb_force_disable(task);
- update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
+ task_update_spec_tif(task);
break;
default:
return -ERANGE;
}
+ return 0;
+}
- /*
- * If being set on non-current task, delay setting the CPU
- * mitigation until it is next scheduled.
- */
- if (task == current && update)
- speculative_store_bypass_update_current();
-
+static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+ return 0;
+ /*
+ * Indirect branch speculation is always disabled in strict
+ * mode.
+ */
+ if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
+ return -EPERM;
+ task_clear_spec_ib_disable(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE:
+ case PR_SPEC_FORCE_DISABLE:
+ /*
+ * Indirect branch speculation is always allowed when
+ * mitigation is force disabled.
+ */
+ if (spectre_v2_user == SPECTRE_V2_USER_NONE)
+ return -EPERM;
+ if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
+ return 0;
+ task_set_spec_ib_disable(task);
+ if (ctrl == PR_SPEC_FORCE_DISABLE)
+ task_set_spec_ib_force_disable(task);
+ task_update_spec_tif(task);
+ break;
+ default:
+ return -ERANGE;
+ }
return 0;
}
@@ -649,6 +844,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssb_prctl_set(task, ctrl);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_set(task, ctrl);
default:
return -ENODEV;
}
@@ -659,6 +856,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task)
{
if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+ if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP)
+ ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
}
#endif
@@ -681,11 +880,35 @@ static int ssb_prctl_get(struct task_struct *task)
}
}
+static int ib_prctl_get(struct task_struct *task)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+ return PR_SPEC_NOT_AFFECTED;
+
+ switch (spectre_v2_user) {
+ case SPECTRE_V2_USER_NONE:
+ return PR_SPEC_ENABLE;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ if (task_spec_ib_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ib_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ case SPECTRE_V2_USER_STRICT:
+ return PR_SPEC_DISABLE;
+ default:
+ return PR_SPEC_NOT_AFFECTED;
+ }
+}
+
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
{
switch (which) {
case PR_SPEC_STORE_BYPASS:
return ssb_prctl_get(task);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_get(task);
default:
return -ENODEV;
}
@@ -823,7 +1046,7 @@ early_param("l1tf", l1tf_cmdline);
#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
#if IS_ENABLED(CONFIG_KVM_INTEL)
-static const char *l1tf_vmx_states[] = {
+static const char * const l1tf_vmx_states[] = {
[VMENTER_L1D_FLUSH_AUTO] = "auto",
[VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
[VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
@@ -839,13 +1062,14 @@ static ssize_t l1tf_show_state(char *buf)
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
(l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
- cpu_smt_control == CPU_SMT_ENABLED))
+ sched_smt_active())) {
return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
l1tf_vmx_states[l1tf_vmx_mitigation]);
+ }
return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
l1tf_vmx_states[l1tf_vmx_mitigation],
- cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled");
+ sched_smt_active() ? "vulnerable" : "disabled");
}
#else
static ssize_t l1tf_show_state(char *buf)
@@ -854,11 +1078,39 @@ static ssize_t l1tf_show_state(char *buf)
}
#endif
+static char *stibp_state(void)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ return "";
+
+ switch (spectre_v2_user) {
+ case SPECTRE_V2_USER_NONE:
+ return ", STIBP: disabled";
+ case SPECTRE_V2_USER_STRICT:
+ return ", STIBP: forced";
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ if (static_key_enabled(&switch_to_cond_stibp))
+ return ", STIBP: conditional";
+ }
+ return "";
+}
+
+static char *ibpb_state(void)
+{
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ if (static_key_enabled(&switch_mm_always_ibpb))
+ return ", IBPB: always-on";
+ if (static_key_enabled(&switch_mm_cond_ibpb))
+ return ", IBPB: conditional";
+ return ", IBPB: disabled";
+ }
+ return "";
+}
+
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
- int ret;
-
if (!boot_cpu_has_bug(bug))
return sprintf(buf, "Not affected\n");
@@ -876,13 +1128,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
return sprintf(buf, "Mitigation: __user pointer sanitization\n");
case X86_BUG_SPECTRE_V2:
- ret = sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
- boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
+ return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
- (x86_spec_ctrl_base & SPEC_CTRL_STIBP) ? ", STIBP" : "",
+ stibp_state(),
boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
spectre_v2_module_string());
- return ret;
case X86_BUG_SPEC_STORE_BYPASS:
return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index dd33c357548f..e12454e21b8a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -56,7 +56,7 @@
/* Threshold LVT offset is at MSR0xC0000410[15:12] */
#define SMCA_THR_LVT_OFF 0xF000
-static bool thresholding_en;
+static bool thresholding_irq_en;
static const char * const th_names[] = {
"load_store",
@@ -534,9 +534,8 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
set_offset:
offset = setup_APIC_mce_threshold(offset, new);
-
- if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt))
- mce_threshold_vector = amd_threshold_interrupt;
+ if (offset == new)
+ thresholding_irq_en = true;
done:
mce_threshold_block_init(&b, offset);
@@ -1357,9 +1356,6 @@ int mce_threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
- if (!thresholding_en)
- return 0;
-
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
@@ -1377,9 +1373,6 @@ int mce_threshold_create_device(unsigned int cpu)
struct threshold_bank **bp;
int err = 0;
- if (!thresholding_en)
- return 0;
-
bp = per_cpu(threshold_banks, cpu);
if (bp)
return 0;
@@ -1408,9 +1401,6 @@ static __init int threshold_init_device(void)
{
unsigned lcpu = 0;
- if (mce_threshold_vector == amd_threshold_interrupt)
- thresholding_en = true;
-
/* to hit CPUs online before the notifier is up */
for_each_online_cpu(lcpu) {
int err = mce_threshold_create_device(lcpu);
@@ -1419,6 +1409,9 @@ static __init int threshold_init_device(void)
return err;
}
+ if (thresholding_irq_en)
+ mce_threshold_vector = amd_threshold_interrupt;
+
return 0;
}
/*
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 61a949d84dfa..d99a8ee9e185 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -344,10 +344,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
sanitize_restored_xstate(tsk, &env, xfeatures, fx_only);
}
+ local_bh_disable();
fpu->initialized = 1;
- preempt_disable();
fpu__restore(fpu);
- preempt_enable();
+ local_bh_enable();
return err;
} else {
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 01ebcb6f263e..7ee8067cbf45 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -994,7 +994,6 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
{
unsigned long old;
int faulted;
- struct ftrace_graph_ent trace;
unsigned long return_hooker = (unsigned long)
&return_to_handler;
@@ -1046,19 +1045,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
return;
}
- trace.func = self_addr;
- trace.depth = current->curr_ret_stack + 1;
-
- /* Only trace if the calling function expects to */
- if (!ftrace_graph_entry(&trace)) {
+ if (function_graph_enter(old, self_addr, frame_pointer, parent))
*parent = old;
- return;
- }
-
- if (ftrace_push_return_trace(old, self_addr, &trace.depth,
- frame_pointer, parent) == -EBUSY) {
- *parent = old;
- return;
- }
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 76fa3b836598..ec6fefbfd3c0 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -37,7 +37,6 @@ asmlinkage __visible void __init i386_start_kernel(void)
cr4_init_shadow();
sanitize_boot_params(&boot_params);
- x86_verify_bootdata_version();
x86_early_init_platform_quirks();
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7663a8eb602b..16b1cbd3a61e 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -457,8 +457,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
if (!boot_params.hdr.version)
copy_bootdata(__va(real_mode_data));
- x86_verify_bootdata_version();
-
x86_early_init_platform_quirks();
switch (boot_params.hdr.hardware_subarch) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c93fcfdf1673..7d31192296a8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -40,6 +40,8 @@
#include <asm/prctl.h>
#include <asm/spec-ctrl.h>
+#include "process.h"
+
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
* no more per-task TSS's. The TSS size is kept cacheline-aligned
@@ -252,11 +254,12 @@ void arch_setup_new_exec(void)
enable_cpuid();
}
-static inline void switch_to_bitmap(struct tss_struct *tss,
- struct thread_struct *prev,
+static inline void switch_to_bitmap(struct thread_struct *prev,
struct thread_struct *next,
unsigned long tifp, unsigned long tifn)
{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+
if (tifn & _TIF_IO_BITMAP) {
/*
* Copy the relevant range of the IO bitmap.
@@ -395,32 +398,85 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}
-static __always_inline void intel_set_ssb_state(unsigned long tifn)
+/*
+ * Update the MSRs managing speculation control, during context switch.
+ *
+ * tifp: Previous task's thread flags
+ * tifn: Next task's thread flags
+ */
+static __always_inline void __speculation_ctrl_update(unsigned long tifp,
+ unsigned long tifn)
{
- u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
+ unsigned long tif_diff = tifp ^ tifn;
+ u64 msr = x86_spec_ctrl_base;
+ bool updmsr = false;
+
+ /*
+ * If TIF_SSBD is different, select the proper mitigation
+ * method. Note that if SSBD mitigation is disabled or permanentely
+ * enabled this branch can't be taken because nothing can set
+ * TIF_SSBD.
+ */
+ if (tif_diff & _TIF_SSBD) {
+ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ amd_set_ssb_virt_state(tifn);
+ } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
+ amd_set_core_ssb_state(tifn);
+ } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ msr |= ssbd_tif_to_spec_ctrl(tifn);
+ updmsr = true;
+ }
+ }
+
+ /*
+ * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
+ * otherwise avoid the MSR write.
+ */
+ if (IS_ENABLED(CONFIG_SMP) &&
+ static_branch_unlikely(&switch_to_cond_stibp)) {
+ updmsr |= !!(tif_diff & _TIF_SPEC_IB);
+ msr |= stibp_tif_to_spec_ctrl(tifn);
+ }
- wrmsrl(MSR_IA32_SPEC_CTRL, msr);
+ if (updmsr)
+ wrmsrl(MSR_IA32_SPEC_CTRL, msr);
}
-static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
+static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
- if (static_cpu_has(X86_FEATURE_VIRT_SSBD))
- amd_set_ssb_virt_state(tifn);
- else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
- amd_set_core_ssb_state(tifn);
- else
- intel_set_ssb_state(tifn);
+ if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
+ if (task_spec_ssb_disable(tsk))
+ set_tsk_thread_flag(tsk, TIF_SSBD);
+ else
+ clear_tsk_thread_flag(tsk, TIF_SSBD);
+
+ if (task_spec_ib_disable(tsk))
+ set_tsk_thread_flag(tsk, TIF_SPEC_IB);
+ else
+ clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
+ }
+ /* Return the updated threadinfo flags*/
+ return task_thread_info(tsk)->flags;
}
-void speculative_store_bypass_update(unsigned long tif)
+void speculation_ctrl_update(unsigned long tif)
{
+ /* Forced update. Make sure all relevant TIF flags are different */
preempt_disable();
- __speculative_store_bypass_update(tif);
+ __speculation_ctrl_update(~tif, tif);
preempt_enable();
}
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
- struct tss_struct *tss)
+/* Called from seccomp/prctl update */
+void speculation_ctrl_update_current(void)
+{
+ preempt_disable();
+ speculation_ctrl_update(speculation_ctrl_update_tif(current));
+ preempt_enable();
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev, *next;
unsigned long tifp, tifn;
@@ -430,7 +486,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
tifn = READ_ONCE(task_thread_info(next_p)->flags);
tifp = READ_ONCE(task_thread_info(prev_p)->flags);
- switch_to_bitmap(tss, prev, next, tifp, tifn);
+ switch_to_bitmap(prev, next, tifp, tifn);
propagate_user_return_notify(prev_p, next_p);
@@ -451,8 +507,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
if ((tifp ^ tifn) & _TIF_NOCPUID)
set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
- if ((tifp ^ tifn) & _TIF_SSBD)
- __speculative_store_bypass_update(tifn);
+ if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
+ __speculation_ctrl_update(tifp, tifn);
+ } else {
+ speculation_ctrl_update_tif(prev_p);
+ tifn = speculation_ctrl_update_tif(next_p);
+
+ /* Enforce MSR update to ensure consistent state */
+ __speculation_ctrl_update(~tifn, tifn);
+ }
}
/*
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
new file mode 100644
index 000000000000..898e97cf6629
--- /dev/null
+++ b/arch/x86/kernel/process.h
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+// Code shared between 32 and 64 bit
+
+#include <asm/spec-ctrl.h>
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
+
+/*
+ * This needs to be inline to optimize for the common case where no extra
+ * work needs to be done.
+ */
+static inline void switch_to_extra(struct task_struct *prev,
+ struct task_struct *next)
+{
+ unsigned long next_tif = task_thread_info(next)->flags;
+ unsigned long prev_tif = task_thread_info(prev)->flags;
+
+ if (IS_ENABLED(CONFIG_SMP)) {
+ /*
+ * Avoid __switch_to_xtra() invocation when conditional
+ * STIPB is disabled and the only different bit is
+ * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
+ * in the TIF_WORK_CTXSW masks.
+ */
+ if (!static_branch_likely(&switch_to_cond_stibp)) {
+ prev_tif &= ~_TIF_SPEC_IB;
+ next_tif &= ~_TIF_SPEC_IB;
+ }
+ }
+
+ /*
+ * __switch_to_xtra() handles debug registers, i/o bitmaps,
+ * speculation mitigations etc.
+ */
+ if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
+ prev_tif & _TIF_WORK_CTXSW_PREV))
+ __switch_to_xtra(prev, next);
+}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 5046a3c9dec2..d3e593eb189f 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -59,6 +59,8 @@
#include <asm/intel_rdt_sched.h>
#include <asm/proto.h>
+#include "process.h"
+
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -232,7 +234,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -264,12 +265,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
set_iopl_mask(next->iopl);
- /*
- * Now maybe handle debug registers and/or IO bitmaps
- */
- if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
- task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
- __switch_to_xtra(prev_p, next_p, tss);
+ switch_to_extra(prev_p, next_p);
/*
* Leave lazy mode, flushing any hypercalls made here.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0e0b4288a4b2..bbfbf017065c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -60,6 +60,8 @@
#include <asm/unistd_32_ia32.h>
#endif
+#include "process.h"
+
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
@@ -553,7 +555,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *prev_fpu = &prev->fpu;
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
@@ -617,12 +618,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Reload sp0. */
update_task_stack(next_p);
- /*
- * Now maybe reload the debug registers and handle I/O bitmaps
- */
- if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
- task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
- __switch_to_xtra(prev_p, next_p, tss);
+ switch_to_extra(prev_p, next_p);
#ifdef CONFIG_XEN_PV
/*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b74e7bfed6ab..d494b9bfe618 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1280,23 +1280,6 @@ void __init setup_arch(char **cmdline_p)
unwind_init();
}
-/*
- * From boot protocol 2.14 onwards we expect the bootloader to set the
- * version to "0x8000 | <used version>". In case we find a version >= 2.14
- * without the 0x8000 we assume the boot loader supports 2.13 only and
- * reset the version accordingly. The 0x8000 flag is removed in any case.
- */
-void __init x86_verify_bootdata_version(void)
-{
- if (boot_params.hdr.version & VERSION_WRITTEN)
- boot_params.hdr.version &= ~VERSION_WRITTEN;
- else if (boot_params.hdr.version >= 0x020e)
- boot_params.hdr.version = 0x020d;
-
- if (boot_params.hdr.version < 0x020e)
- boot_params.hdr.acpi_rsdp_addr = 0;
-}
-
#ifdef CONFIG_X86_32
static struct resource video_ram_resource = {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 89db20f8cb70..c4533d05c214 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -55,7 +55,7 @@
#define PRIo64 "o"
/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */
-#define apic_debug(fmt, arg...)
+#define apic_debug(fmt, arg...) do {} while (0)
/* 14 is the version for Xeon and Pentium 8.4.8*/
#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
@@ -576,6 +576,11 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
+ if (unlikely(!map)) {
+ count = -EOPNOTSUPP;
+ goto out;
+ }
+
if (min > map->max_apic_id)
goto out;
/* Bits above cluster_size are masked in the caller. */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cf5f572f2305..7c03c0f35444 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5074,9 +5074,9 @@ static bool need_remote_flush(u64 old, u64 new)
}
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
- const u8 *new, int *bytes)
+ int *bytes)
{
- u64 gentry;
+ u64 gentry = 0;
int r;
/*
@@ -5088,22 +5088,12 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
*gpa &= ~(gpa_t)7;
*bytes = 8;
- r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8);
- if (r)
- gentry = 0;
- new = (const u8 *)&gentry;
}
- switch (*bytes) {
- case 4:
- gentry = *(const u32 *)new;
- break;
- case 8:
- gentry = *(const u64 *)new;
- break;
- default:
- gentry = 0;
- break;
+ if (*bytes == 4 || *bytes == 8) {
+ r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
+ if (r)
+ gentry = 0;
}
return gentry;
@@ -5207,8 +5197,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
-
/*
* No need to care whether allocation memory is successful
* or not since pte prefetch is skiped if it does not have
@@ -5217,6 +5205,9 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mmu_topup_memory_caches(vcpu);
spin_lock(&vcpu->kvm->mmu_lock);
+
+ gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
+
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0e21ccc46792..cc6467b35a85 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1446,7 +1446,7 @@ static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
return vcpu->arch.tsc_offset;
}
-static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 g_tsc_offset = 0;
@@ -1464,6 +1464,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ return svm->vmcb->control.tsc_offset;
}
static void avic_init_vmcb(struct vcpu_svm *svm)
@@ -1664,20 +1665,23 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
static int avic_init_access_page(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- int ret;
+ int ret = 0;
+ mutex_lock(&kvm->slots_lock);
if (kvm->arch.apic_access_page_done)
- return 0;
+ goto out;
- ret = x86_set_memory_region(kvm,
- APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE,
- PAGE_SIZE);
+ ret = __x86_set_memory_region(kvm,
+ APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE,
+ PAGE_SIZE);
if (ret)
- return ret;
+ goto out;
kvm->arch.apic_access_page_done = true;
- return 0;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return ret;
}
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
@@ -2189,21 +2193,31 @@ out:
return ERR_PTR(err);
}
+static void svm_clear_current_vmcb(struct vmcb *vmcb)
+{
+ int i;
+
+ for_each_online_cpu(i)
+ cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
+}
+
static void svm_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * The vmcb page can be recycled, causing a false negative in
+ * svm_vcpu_load(). So, ensure that no logical CPU has this
+ * vmcb page recorded as its current vmcb.
+ */
+ svm_clear_current_vmcb(svm->vmcb);
+
__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
__free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
- /*
- * The vmcb page can be recycled, causing a false negative in
- * svm_vcpu_load(). So do a full IBPB now.
- */
- indirect_branch_prediction_barrier();
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -7149,7 +7163,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
.read_l1_tsc_offset = svm_read_l1_tsc_offset,
- .write_tsc_offset = svm_write_tsc_offset,
+ .write_l1_tsc_offset = svm_write_l1_tsc_offset,
.set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4555077d69ce..02edd9960e9d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -174,6 +174,7 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
+module_param(ple_gap, uint, 0444);
static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);
@@ -984,6 +985,7 @@ struct vcpu_vmx {
struct shared_msr_entry *guest_msrs;
int nmsrs;
int save_nmsrs;
+ bool guest_msrs_dirty;
unsigned long host_idt_base;
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
@@ -1306,7 +1308,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
u16 error_code);
static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
-static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
@@ -1610,12 +1612,6 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /* We don't support disabling the feature for simplicity. */
- if (vmx->nested.enlightened_vmcs_enabled)
- return 0;
-
- vmx->nested.enlightened_vmcs_enabled = true;
-
/*
* vmcs_version represents the range of supported Enlightened VMCS
* versions: lower 8 bits is the minimal version, higher 8 bits is the
@@ -1625,6 +1621,12 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
if (vmcs_version)
*vmcs_version = (KVM_EVMCS_VERSION << 8) | 1;
+ /* We don't support disabling the feature for simplicity. */
+ if (vmx->nested.enlightened_vmcs_enabled)
+ return 0;
+
+ vmx->nested.enlightened_vmcs_enabled = true;
+
vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
@@ -2897,6 +2899,20 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
vmx->req_immediate_exit = false;
+ /*
+ * Note that guest MSRs to be saved/restored can also be changed
+ * when guest state is loaded. This happens when guest transitions
+ * to/from long-mode by setting MSR_EFER.LMA.
+ */
+ if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
+ vmx->guest_msrs_dirty = false;
+ for (i = 0; i < vmx->save_nmsrs; ++i)
+ kvm_set_shared_msr(vmx->guest_msrs[i].index,
+ vmx->guest_msrs[i].data,
+ vmx->guest_msrs[i].mask);
+
+ }
+
if (vmx->loaded_cpu_state)
return;
@@ -2957,11 +2973,6 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
vmcs_writel(HOST_GS_BASE, gs_base);
host_state->gs_base = gs_base;
}
-
- for (i = 0; i < vmx->save_nmsrs; ++i)
- kvm_set_shared_msr(vmx->guest_msrs[i].index,
- vmx->guest_msrs[i].data,
- vmx->guest_msrs[i].mask);
}
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
@@ -3436,6 +3447,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
move_msr_up(vmx, index, save_nmsrs++);
vmx->save_nmsrs = save_nmsrs;
+ vmx->guest_msrs_dirty = true;
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
@@ -3452,11 +3464,9 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
return vcpu->arch.tsc_offset;
}
-/*
- * writes 'offset' into guest's timestamp counter offset register
- */
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
+ u64 active_offset = offset;
if (is_guest_mode(vcpu)) {
/*
* We're here if L1 chose not to trap WRMSR to TSC. According
@@ -3464,17 +3474,16 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
* set for L2 remains unchanged, and still needs to be added
* to the newly set TSC to get L2's TSC.
*/
- struct vmcs12 *vmcs12;
- /* recalculate vmcs02.TSC_OFFSET: */
- vmcs12 = get_vmcs12(vcpu);
- vmcs_write64(TSC_OFFSET, offset +
- (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
- vmcs12->tsc_offset : 0));
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING))
+ active_offset += vmcs12->tsc_offset;
} else {
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
vmcs_read64(TSC_OFFSET), offset);
- vmcs_write64(TSC_OFFSET, offset);
}
+
+ vmcs_write64(TSC_OFFSET, active_offset);
+ return active_offset;
}
/*
@@ -5944,7 +5953,7 @@ static void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
-static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type)
{
int f = sizeof(unsigned long);
@@ -5982,7 +5991,7 @@ static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bit
}
}
-static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type)
{
int f = sizeof(unsigned long);
@@ -6020,7 +6029,7 @@ static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitm
}
}
-static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type, bool value)
{
if (value)
@@ -8664,8 +8673,6 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
- vmcs12->hdr.revision_id = evmcs->revision_id;
-
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
vmcs12->tpr_threshold = evmcs->tpr_threshold;
vmcs12->guest_rip = evmcs->guest_rip;
@@ -9369,7 +9376,30 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page);
- if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) {
+ /*
+ * Currently, KVM only supports eVMCS version 1
+ * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
+ * value to first u32 field of eVMCS which should specify eVMCS
+ * VersionNumber.
+ *
+ * Guest should be aware of supported eVMCS versions by host by
+ * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
+ * expected to set this CPUID leaf according to the value
+ * returned in vmcs_version from nested_enable_evmcs().
+ *
+ * However, it turns out that Microsoft Hyper-V fails to comply
+ * to their own invented interface: When Hyper-V use eVMCS, it
+ * just sets first u32 field of eVMCS to revision_id specified
+ * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
+ * which is one of the supported versions specified in
+ * CPUID.0x4000000A.EAX[0:15].
+ *
+ * To overcome Hyper-V bug, we accept here either a supported
+ * eVMCS version or VMCS12 revision_id as valid values for first
+ * u32 field of eVMCS.
+ */
+ if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
+ (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
nested_release_evmcs(vcpu);
return 0;
}
@@ -9390,9 +9420,11 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
* present in struct hv_enlightened_vmcs, ...). Make sure there
* are no leftovers.
*/
- if (from_launch)
- memset(vmx->nested.cached_vmcs12, 0,
- sizeof(*vmx->nested.cached_vmcs12));
+ if (from_launch) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ memset(vmcs12, 0, sizeof(*vmcs12));
+ vmcs12->hdr.revision_id = VMCS12_REVISION;
+ }
}
return 1;
@@ -15062,7 +15094,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
- .write_tsc_offset = vmx_write_tsc_offset,
+ .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
.set_tdp_cr3 = vmx_set_cr3,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5cd5647120f2..d02937760c3b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1665,8 +1665,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
- vcpu->arch.tsc_offset = offset;
+ vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -1794,7 +1793,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
s64 adjustment)
{
- kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
+ u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+ kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
}
static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -6918,6 +6918,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
clock_pairing.nsec = ts.tv_nsec;
clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
clock_pairing.flags = 0;
+ memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
ret = 0;
if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
@@ -7455,7 +7456,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
else {
if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
- kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+ if (ioapic_in_kernel(vcpu->kvm))
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
if (is_guest_mode(vcpu))
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index bddd6b3cee1d..03b6b4c2238d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,7 +7,6 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
-#include <linux/ptrace.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -31,6 +30,12 @@
*/
/*
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_ibpb.
+ */
+#define LAST_USER_MM_IBPB 0x1UL
+
+/*
* We get here when we do something requiring a TLB invalidation
* but could not go invalidate all of the contexts. We do the
* necessary invalidation by clearing out the 'ctx_id' which
@@ -181,17 +186,87 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
}
}
-static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+{
+ unsigned long next_tif = task_thread_info(next)->flags;
+ unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+
+ return (unsigned long)next->mm | ibpb;
+}
+
+static void cond_ibpb(struct task_struct *next)
{
+ if (!next || !next->mm)
+ return;
+
/*
- * Check if the current (previous) task has access to the memory
- * of the @tsk (next) task. If access is denied, make sure to
- * issue a IBPB to stop user->user Spectre-v2 attacks.
- *
- * Note: __ptrace_may_access() returns 0 or -ERRNO.
+ * Both, the conditional and the always IBPB mode use the mm
+ * pointer to avoid the IBPB when switching between tasks of the
+ * same process. Using the mm pointer instead of mm->context.ctx_id
+ * opens a hypothetical hole vs. mm_struct reuse, which is more or
+ * less impossible to control by an attacker. Aside of that it
+ * would only affect the first schedule so the theoretically
+ * exposed data is not really interesting.
*/
- return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
- ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+ if (static_branch_likely(&switch_mm_cond_ibpb)) {
+ unsigned long prev_mm, next_mm;
+
+ /*
+ * This is a bit more complex than the always mode because
+ * it has to handle two cases:
+ *
+ * 1) Switch from a user space task (potential attacker)
+ * which has TIF_SPEC_IB set to a user space task
+ * (potential victim) which has TIF_SPEC_IB not set.
+ *
+ * 2) Switch from a user space task (potential attacker)
+ * which has TIF_SPEC_IB not set to a user space task
+ * (potential victim) which has TIF_SPEC_IB set.
+ *
+ * This could be done by unconditionally issuing IBPB when
+ * a task which has TIF_SPEC_IB set is either scheduled in
+ * or out. Though that results in two flushes when:
+ *
+ * - the same user space task is scheduled out and later
+ * scheduled in again and only a kernel thread ran in
+ * between.
+ *
+ * - a user space task belonging to the same process is
+ * scheduled in after a kernel thread ran in between
+ *
+ * - a user space task belonging to the same process is
+ * scheduled in immediately.
+ *
+ * Optimize this with reasonably small overhead for the
+ * above cases. Mangle the TIF_SPEC_IB bit into the mm
+ * pointer of the incoming task which is stored in
+ * cpu_tlbstate.last_user_mm_ibpb for comparison.
+ */
+ next_mm = mm_mangle_tif_spec_ib(next);
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
+
+ /*
+ * Issue IBPB only if the mm's are different and one or
+ * both have the IBPB bit set.
+ */
+ if (next_mm != prev_mm &&
+ (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+ indirect_branch_prediction_barrier();
+
+ this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
+ }
+
+ if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+ /*
+ * Only flush when switching to a user space task with a
+ * different context than the user space task which ran
+ * last on this CPU.
+ */
+ if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+ indirect_branch_prediction_barrier();
+ this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
+ }
+ }
}
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
@@ -292,22 +367,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
new_asid = prev_asid;
need_flush = true;
} else {
- u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
-
/*
* Avoid user/user BTB poisoning by flushing the branch
* predictor when switching between processes. This stops
* one process from doing Spectre-v2 attacks on another.
- *
- * As an optimization, flush indirect branches only when
- * switching into a processes that can't be ptrace by the
- * current one (as in such case, attacker has much more
- * convenient way how to tamper with the next process than
- * branch buffer poisoning).
*/
- if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
- ibpb_needed(tsk, last_ctx_id))
- indirect_branch_prediction_barrier();
+ cond_ibpb(tsk);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -365,14 +430,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
- /*
- * Record last user mm's context id, so we can avoid
- * flushing branch buffer with IBPB if we switch back
- * to the same user.
- */
- if (next != &init_mm)
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
/* Make sure we write CR3 before loaded_mm. */
barrier();
@@ -441,7 +498,7 @@ void initialize_tlbstate_and_flush(void)
write_cr3(build_cr3(mm->pgd, 0));
/* Reinitialize tlbstate. */
- this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e996e8e744cb..750f46ad018a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -10,7 +10,6 @@
#include <xen/xen.h>
#include <xen/features.h>
#include <xen/page.h>
-#include <xen/interface/memory.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -346,80 +345,3 @@ void xen_arch_unregister_cpu(int num)
}
EXPORT_SYMBOL(xen_arch_unregister_cpu);
#endif
-
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-void __init arch_xen_balloon_init(struct resource *hostmem_resource)
-{
- struct xen_memory_map memmap;
- int rc;
- unsigned int i, last_guest_ram;
- phys_addr_t max_addr = PFN_PHYS(max_pfn);
- struct e820_table *xen_e820_table;
- const struct e820_entry *entry;
- struct resource *res;
-
- if (!xen_initial_domain())
- return;
-
- xen_e820_table = kmalloc(sizeof(*xen_e820_table), GFP_KERNEL);
- if (!xen_e820_table)
- return;
-
- memmap.nr_entries = ARRAY_SIZE(xen_e820_table->entries);
- set_xen_guest_handle(memmap.buffer, xen_e820_table->entries);
- rc = HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap);
- if (rc) {
- pr_warn("%s: Can't read host e820 (%d)\n", __func__, rc);
- goto out;
- }
-
- last_guest_ram = 0;
- for (i = 0; i < memmap.nr_entries; i++) {
- if (xen_e820_table->entries[i].addr >= max_addr)
- break;
- if (xen_e820_table->entries[i].type == E820_TYPE_RAM)
- last_guest_ram = i;
- }
-
- entry = &xen_e820_table->entries[last_guest_ram];
- if (max_addr >= entry->addr + entry->size)
- goto out; /* No unallocated host RAM. */
-
- hostmem_resource->start = max_addr;
- hostmem_resource->end = entry->addr + entry->size;
-
- /*
- * Mark non-RAM regions between the end of dom0 RAM and end of host RAM
- * as unavailable. The rest of that region can be used for hotplug-based
- * ballooning.
- */
- for (; i < memmap.nr_entries; i++) {
- entry = &xen_e820_table->entries[i];
-
- if (entry->type == E820_TYPE_RAM)
- continue;
-
- if (entry->addr >= hostmem_resource->end)
- break;
-
- res = kzalloc(sizeof(*res), GFP_KERNEL);
- if (!res)
- goto out;
-
- res->name = "Unavailable host RAM";
- res->start = entry->addr;
- res->end = (entry->addr + entry->size < hostmem_resource->end) ?
- entry->addr + entry->size : hostmem_resource->end;
- rc = insert_resource(hostmem_resource, res);
- if (rc) {
- pr_warn("%s: Can't insert [%llx - %llx) (%d)\n",
- __func__, res->start, res->end, rc);
- kfree(res);
- goto out;
- }
- }
-
- out:
- kfree(xen_e820_table);
-}
-#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 2bce7958ce8b..0766a08bdf45 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -69,6 +69,11 @@ void xen_mc_flush(void)
trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
+#if MC_DEBUG
+ memcpy(b->debug, b->entries,
+ b->mcidx * sizeof(struct multicall_entry));
+#endif
+
switch (b->mcidx) {
case 0:
/* no-op */
@@ -87,32 +92,34 @@ void xen_mc_flush(void)
break;
default:
-#if MC_DEBUG
- memcpy(b->debug, b->entries,
- b->mcidx * sizeof(struct multicall_entry));
-#endif
-
if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
BUG();
for (i = 0; i < b->mcidx; i++)
if (b->entries[i].result < 0)
ret++;
+ }
+ if (WARN_ON(ret)) {
+ pr_err("%d of %d multicall(s) failed: cpu %d\n",
+ ret, b->mcidx, smp_processor_id());
+ for (i = 0; i < b->mcidx; i++) {
+ if (b->entries[i].result < 0) {
#if MC_DEBUG
- if (ret) {
- printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
- ret, smp_processor_id());
- dump_stack();
- for (i = 0; i < b->mcidx; i++) {
- printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
- i+1, b->mcidx,
+ pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pF\n",
+ i + 1,
b->debug[i].op,
b->debug[i].args[0],
b->entries[i].result,
b->caller[i]);
+#else
+ pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\n",
+ i + 1,
+ b->entries[i].op,
+ b->entries[i].args[0],
+ b->entries[i].result);
+#endif
}
}
-#endif
}
b->mcidx = 0;
@@ -126,8 +133,6 @@ void xen_mc_flush(void)
b->cbidx = 0;
local_irq_restore(flags);
-
- WARN_ON(ret);
}
struct multicall_space __xen_mc_entry(size_t args)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 1163e33121fb..075ed47993bb 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -808,6 +808,7 @@ char * __init xen_memory_setup(void)
addr = xen_e820_table.entries[0].addr;
size = xen_e820_table.entries[0].size;
while (i < xen_e820_table.nr_entries) {
+ bool discard = false;
chunk_size = size;
type = xen_e820_table.entries[i].type;
@@ -823,10 +824,11 @@ char * __init xen_memory_setup(void)
xen_add_extra_mem(pfn_s, n_pfns);
xen_max_p2m_pfn = pfn_s + n_pfns;
} else
- type = E820_TYPE_UNUSABLE;
+ discard = true;
}
- xen_align_and_add_e820_region(addr, chunk_size, type);
+ if (!discard)
+ xen_align_and_add_e820_region(addr, chunk_size, type);
addr += chunk_size;
size -= chunk_size;
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 1c8a8816a402..3776122c87cc 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -3,22 +3,17 @@
* Split spinlock implementation out into its own file, so it can be
* compiled in a FTRACE-compatible way.
*/
-#include <linux/kernel_stat.h>
+#include <linux/kernel.h>
#include <linux/spinlock.h>
-#include <linux/debugfs.h>
-#include <linux/log2.h>
-#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/atomic.h>
#include <asm/paravirt.h>
#include <asm/qspinlock.h>
-#include <xen/interface/xen.h>
#include <xen/events.h>
#include "xen-ops.h"
-#include "debugfs.h"
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(char *, irq_name);