aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/boot/dts/at91-sama5d3_ksz9477_evb.dts5
-rw-r--r--arch/arm/include/asm/xen/xen-ops.h2
-rw-r--r--arch/arm/mm/dma-mapping.c7
-rw-r--r--arch/arm/xen/enlighten.c2
-rw-r--r--arch/arm64/include/asm/kvm_host.h5
-rw-r--r--arch/arm64/include/asm/sysreg.h4
-rw-r--r--arch/arm64/include/asm/virt.h3
-rw-r--r--arch/arm64/include/asm/xen/xen-ops.h2
-rw-r--r--arch/arm64/kernel/cpufeature.c11
-rw-r--r--arch/arm64/kernel/entry-ftrace.S1
-rw-r--r--arch/arm64/kernel/fpsimd.c20
-rw-r--r--arch/arm64/kernel/ftrace.c137
-rw-r--r--arch/arm64/kernel/mte.c6
-rw-r--r--arch/arm64/kernel/setup.c7
-rw-r--r--arch/arm64/kvm/arch_timer.c3
-rw-r--r--arch/arm64/kvm/arm.c10
-rw-r--r--arch/arm64/kvm/fpsimd.c2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c4
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sys_regs.c42
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v2.c4
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio-v3.c40
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio.c40
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio.h3
-rw-r--r--arch/arm64/kvm/vmid.c2
-rw-r--r--arch/arm64/mm/cache.S2
-rw-r--r--arch/arm64/mm/dma-mapping.c7
-rw-r--r--arch/arm64/net/bpf_jit_comp.c1
-rwxr-xr-xarch/arm64/tools/gen-sysreg.awk2
-rw-r--r--arch/loongarch/Kconfig1
-rw-r--r--arch/loongarch/include/asm/hardirq.h2
-rw-r--r--arch/loongarch/include/asm/percpu.h1
-rw-r--r--arch/loongarch/include/asm/smp.h23
-rw-r--r--arch/loongarch/include/asm/timex.h7
-rw-r--r--arch/loongarch/kernel/acpi.c4
-rw-r--r--arch/loongarch/kernel/cacheinfo.c1
-rw-r--r--arch/loongarch/kernel/irq.c7
-rw-r--r--arch/loongarch/kernel/process.c14
-rw-r--r--arch/loongarch/kernel/setup.c5
-rw-r--r--arch/loongarch/kernel/smp.c2
-rw-r--r--arch/loongarch/kernel/vmlinux.lds.S1
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/include/asm/thread_info.h10
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kernel/process.c4
-rw-r--r--arch/powerpc/kernel/ptrace/ptrace-fpu.c20
-rw-r--r--arch/powerpc/kernel/ptrace/ptrace.c3
-rw-r--r--arch/powerpc/kernel/rtas.c4
-rw-r--r--arch/powerpc/kexec/crash.c2
-rw-r--r--arch/powerpc/mm/nohash/kaslr_booke.c8
-rw-r--r--arch/powerpc/platforms/powernv/Makefile1
-rw-r--r--arch/powerpc/platforms/pseries/papr_scm.c3
-rw-r--r--arch/riscv/Kconfig9
-rw-r--r--arch/riscv/Kconfig.erratas1
-rw-r--r--arch/riscv/boot/dts/microchip/mpfs.dtsi9
-rw-r--r--arch/riscv/kernel/cpufeature.c5
-rw-r--r--arch/riscv/kvm/vmid.c2
-rw-r--r--arch/s390/Kconfig2
-rw-r--r--arch/s390/Makefile10
-rw-r--r--arch/s390/mm/init.c13
-rw-r--r--arch/um/drivers/virt-pci.c7
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/coco/tdx/tdx.c187
-rw-r--r--arch/x86/hyperv/hv_init.c6
-rw-r--r--arch/x86/hyperv/ivm.c84
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/e820/api.h5
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h70
-rw-r--r--arch/x86/include/asm/mshyperv.h4
-rw-r--r--arch/x86/include/asm/msr-index.h25
-rw-r--r--arch/x86/include/asm/nospec-branch.h2
-rw-r--r--arch/x86/include/asm/pci_x86.h8
-rw-r--r--arch/x86/include/asm/setup.h38
-rw-r--r--arch/x86/include/asm/uaccess.h2
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/cpu/bugs.c235
-rw-r--r--arch/x86/kernel/cpu/common.c52
-rw-r--r--arch/x86/kernel/ftrace_64.S11
-rw-r--r--arch/x86/kernel/resource.c14
-rw-r--r--arch/x86/kernel/setup.c5
-rw-r--r--arch/x86/kernel/vmlinux.lds.S4
-rw-r--r--arch/x86/kvm/lapic.c27
-rw-r--r--arch/x86/kvm/mmu/mmu.c4
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c9
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h1
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c38
-rw-r--r--arch/x86/kvm/svm/avic.c171
-rw-r--r--arch/x86/kvm/svm/nested.c43
-rw-r--r--arch/x86/kvm/svm/svm.c42
-rw-r--r--arch/x86/kvm/svm/svm.h6
-rw-r--r--arch/x86/kvm/vmx/vmx.c77
-rw-r--r--arch/x86/kvm/vmx/vmx.h2
-rw-r--r--arch/x86/kvm/x86.c53
-rw-r--r--arch/x86/kvm/xen.h6
-rw-r--r--arch/x86/mm/mem_encrypt.c7
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c4
-rw-r--r--arch/x86/pci/acpi.c8
-rw-r--r--arch/x86/xen/enlighten_hvm.c2
-rw-r--r--arch/x86/xen/enlighten_pv.c2
99 files changed, 1249 insertions, 572 deletions
diff --git a/arch/arm/boot/dts/at91-sama5d3_ksz9477_evb.dts b/arch/arm/boot/dts/at91-sama5d3_ksz9477_evb.dts
index 443e8b022897..14af1fd6d247 100644
--- a/arch/arm/boot/dts/at91-sama5d3_ksz9477_evb.dts
+++ b/arch/arm/boot/dts/at91-sama5d3_ksz9477_evb.dts
@@ -120,26 +120,31 @@
port@0 {
reg = <0>;
label = "lan1";
+ phy-mode = "internal";
};
port@1 {
reg = <1>;
label = "lan2";
+ phy-mode = "internal";
};
port@2 {
reg = <2>;
label = "lan3";
+ phy-mode = "internal";
};
port@3 {
reg = <3>;
label = "lan4";
+ phy-mode = "internal";
};
port@4 {
reg = <4>;
label = "lan5";
+ phy-mode = "internal";
};
port@5 {
diff --git a/arch/arm/include/asm/xen/xen-ops.h b/arch/arm/include/asm/xen/xen-ops.h
new file mode 100644
index 000000000000..7ebb7eb0bd93
--- /dev/null
+++ b/arch/arm/include/asm/xen/xen-ops.h
@@ -0,0 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <xen/arm/xen-ops.h>
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 82ffac621854..059cce018570 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -33,7 +33,7 @@
#include <asm/dma-iommu.h>
#include <asm/mach/map.h>
#include <asm/system_info.h>
-#include <xen/swiotlb-xen.h>
+#include <asm/xen/xen-ops.h>
#include "dma.h"
#include "mm.h"
@@ -2287,10 +2287,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
set_dma_ops(dev, dma_ops);
-#ifdef CONFIG_XEN
- if (xen_initial_domain())
- dev->dma_ops = &xen_swiotlb_dma_ops;
-#endif
+ xen_setup_dma_ops(dev);
dev->archdata.dma_ops_setup = true;
}
diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index 07eb69f9e7df..1f9c3ba32833 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -443,6 +443,8 @@ static int __init xen_guest_init(void)
if (!xen_domain())
return 0;
+ xen_set_restricted_virtio_memory_access();
+
if (!acpi_disabled)
xen_acpi_guest_init();
else
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 47a1e25e25bb..de32152cea04 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -363,11 +363,6 @@ struct kvm_vcpu_arch {
struct kvm_pmu pmu;
/*
- * Anything that is not used directly from assembly code goes
- * here.
- */
-
- /*
* Guest registers we preserve during guest debugging.
*
* These shadow registers are updated by the kvm_handle_sys_reg
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 55f998c3dc28..42ff95dba6da 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -843,9 +843,9 @@
#define ID_AA64SMFR0_F32F32_SHIFT 32
#define ID_AA64SMFR0_FA64 0x1
-#define ID_AA64SMFR0_I16I64 0x4
+#define ID_AA64SMFR0_I16I64 0xf
#define ID_AA64SMFR0_F64F64 0x1
-#define ID_AA64SMFR0_I8I32 0x4
+#define ID_AA64SMFR0_I8I32 0xf
#define ID_AA64SMFR0_F16F32 0x1
#define ID_AA64SMFR0_B16F32 0x1
#define ID_AA64SMFR0_F32F32 0x1
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index 3c8af033a997..0e80db4327b6 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -113,6 +113,9 @@ static __always_inline bool has_vhe(void)
/*
* Code only run in VHE/NVHE hyp context can assume VHE is present or
* absent. Otherwise fall back to caps.
+ * This allows the compiler to discard VHE-specific code from the
+ * nVHE object, reducing the number of external symbol references
+ * needed to link.
*/
if (is_vhe_hyp_code())
return true;
diff --git a/arch/arm64/include/asm/xen/xen-ops.h b/arch/arm64/include/asm/xen/xen-ops.h
new file mode 100644
index 000000000000..7ebb7eb0bd93
--- /dev/null
+++ b/arch/arm64/include/asm/xen/xen-ops.h
@@ -0,0 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <xen/arm/xen-ops.h>
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 42ea2bd856c6..8d88433de81d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1974,15 +1974,7 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
#ifdef CONFIG_KVM
static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused)
{
- if (kvm_get_mode() != KVM_MODE_PROTECTED)
- return false;
-
- if (is_kernel_in_hyp_mode()) {
- pr_warn("Protected KVM not available with VHE\n");
- return false;
- }
-
- return true;
+ return kvm_get_mode() == KVM_MODE_PROTECTED;
}
#endif /* CONFIG_KVM */
@@ -3109,7 +3101,6 @@ void cpu_set_feature(unsigned int num)
WARN_ON(num >= MAX_CPU_FEATURES);
elf_hwcap |= BIT(num);
}
-EXPORT_SYMBOL_GPL(cpu_set_feature);
bool cpu_have_feature(unsigned int num)
{
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index d42a205ef625..bd5df50e4643 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -102,7 +102,6 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
* x19-x29 per the AAPCS, and we created frame records upon entry, so we need
* to restore x0-x8, x29, and x30.
*/
-ftrace_common_return:
/* Restore function arguments */
ldp x0, x1, [sp]
ldp x2, x3, [sp, #S_X2]
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 819979398127..aecf3071efdd 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -331,7 +331,7 @@ void task_set_vl_onexec(struct task_struct *task, enum vec_type type,
* trapping to the kernel.
*
* When stored, Z0-Z31 (incorporating Vn in bits[127:0] or the
- * corresponding Zn), P0-P15 and FFR are encoded in in
+ * corresponding Zn), P0-P15 and FFR are encoded in
* task->thread.sve_state, formatted appropriately for vector
* length task->thread.sve_vl or, if SVCR.SM is set,
* task->thread.sme_vl.
@@ -1916,10 +1916,15 @@ void __efi_fpsimd_begin(void)
if (system_supports_sme()) {
svcr = read_sysreg_s(SYS_SVCR);
- if (!system_supports_fa64())
- ffr = svcr & SVCR_SM_MASK;
+ __this_cpu_write(efi_sm_state,
+ svcr & SVCR_SM_MASK);
- __this_cpu_write(efi_sm_state, ffr);
+ /*
+ * Unless we have FA64 FFR does not
+ * exist in streaming mode.
+ */
+ if (!system_supports_fa64())
+ ffr = !(svcr & SVCR_SM_MASK);
}
sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()),
@@ -1964,8 +1969,13 @@ void __efi_fpsimd_end(void)
sysreg_clear_set_s(SYS_SVCR,
0,
SVCR_SM_MASK);
+
+ /*
+ * Unless we have FA64 FFR does not
+ * exist in streaming mode.
+ */
if (!system_supports_fa64())
- ffr = efi_sm_state;
+ ffr = false;
}
}
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index f447c4a36f69..ea5dc7c90f46 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -78,47 +78,76 @@ static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
}
/*
- * Turn on the call to ftrace_caller() in instrumented function
+ * Find the address the callsite must branch to in order to reach '*addr'.
+ *
+ * Due to the limited range of 'BL' instructions, modules may be placed too far
+ * away to branch directly and must use a PLT.
+ *
+ * Returns true when '*addr' contains a reachable target address, or has been
+ * modified to contain a PLT address. Returns false otherwise.
*/
-int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+static bool ftrace_find_callable_addr(struct dyn_ftrace *rec,
+ struct module *mod,
+ unsigned long *addr)
{
unsigned long pc = rec->ip;
- u32 old, new;
- long offset = (long)pc - (long)addr;
+ long offset = (long)*addr - (long)pc;
+ struct plt_entry *plt;
- if (offset < -SZ_128M || offset >= SZ_128M) {
- struct module *mod;
- struct plt_entry *plt;
+ /*
+ * When the target is within range of the 'BL' instruction, use 'addr'
+ * as-is and branch to that directly.
+ */
+ if (offset >= -SZ_128M && offset < SZ_128M)
+ return true;
- if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
- return -EINVAL;
+ /*
+ * When the target is outside of the range of a 'BL' instruction, we
+ * must use a PLT to reach it. We can only place PLTs for modules, and
+ * only when module PLT support is built-in.
+ */
+ if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
+ return false;
- /*
- * On kernels that support module PLTs, the offset between the
- * branch instruction and its target may legally exceed the
- * range of an ordinary relative 'bl' opcode. In this case, we
- * need to branch via a trampoline in the module.
- *
- * NOTE: __module_text_address() must be called with preemption
- * disabled, but we can rely on ftrace_lock to ensure that 'mod'
- * retains its validity throughout the remainder of this code.
- */
+ /*
+ * 'mod' is only set at module load time, but if we end up
+ * dealing with an out-of-range condition, we can assume it
+ * is due to a module being loaded far away from the kernel.
+ *
+ * NOTE: __module_text_address() must be called with preemption
+ * disabled, but we can rely on ftrace_lock to ensure that 'mod'
+ * retains its validity throughout the remainder of this code.
+ */
+ if (!mod) {
preempt_disable();
mod = __module_text_address(pc);
preempt_enable();
+ }
- if (WARN_ON(!mod))
- return -EINVAL;
+ if (WARN_ON(!mod))
+ return false;
- plt = get_ftrace_plt(mod, addr);
- if (!plt) {
- pr_err("ftrace: no module PLT for %ps\n", (void *)addr);
- return -EINVAL;
- }
-
- addr = (unsigned long)plt;
+ plt = get_ftrace_plt(mod, *addr);
+ if (!plt) {
+ pr_err("ftrace: no module PLT for %ps\n", (void *)*addr);
+ return false;
}
+ *addr = (unsigned long)plt;
+ return true;
+}
+
+/*
+ * Turn on the call to ftrace_caller() in instrumented function
+ */
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+ unsigned long pc = rec->ip;
+ u32 old, new;
+
+ if (!ftrace_find_callable_addr(rec, NULL, &addr))
+ return -EINVAL;
+
old = aarch64_insn_gen_nop();
new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
@@ -132,6 +161,11 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long pc = rec->ip;
u32 old, new;
+ if (!ftrace_find_callable_addr(rec, NULL, &old_addr))
+ return -EINVAL;
+ if (!ftrace_find_callable_addr(rec, NULL, &addr))
+ return -EINVAL;
+
old = aarch64_insn_gen_branch_imm(pc, old_addr,
AARCH64_INSN_BRANCH_LINK);
new = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
@@ -181,54 +215,15 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
unsigned long addr)
{
unsigned long pc = rec->ip;
- bool validate = true;
u32 old = 0, new;
- long offset = (long)pc - (long)addr;
- if (offset < -SZ_128M || offset >= SZ_128M) {
- u32 replaced;
-
- if (!IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
- return -EINVAL;
-
- /*
- * 'mod' is only set at module load time, but if we end up
- * dealing with an out-of-range condition, we can assume it
- * is due to a module being loaded far away from the kernel.
- */
- if (!mod) {
- preempt_disable();
- mod = __module_text_address(pc);
- preempt_enable();
-
- if (WARN_ON(!mod))
- return -EINVAL;
- }
-
- /*
- * The instruction we are about to patch may be a branch and
- * link instruction that was redirected via a PLT entry. In
- * this case, the normal validation will fail, but we can at
- * least check that we are dealing with a branch and link
- * instruction that points into the right module.
- */
- if (aarch64_insn_read((void *)pc, &replaced))
- return -EFAULT;
-
- if (!aarch64_insn_is_bl(replaced) ||
- !within_module(pc + aarch64_get_branch_offset(replaced),
- mod))
- return -EINVAL;
-
- validate = false;
- } else {
- old = aarch64_insn_gen_branch_imm(pc, addr,
- AARCH64_INSN_BRANCH_LINK);
- }
+ if (!ftrace_find_callable_addr(rec, mod, &addr))
+ return -EINVAL;
+ old = aarch64_insn_gen_branch_imm(pc, addr, AARCH64_INSN_BRANCH_LINK);
new = aarch64_insn_gen_nop();
- return ftrace_modify_code(pc, old, new, validate);
+ return ftrace_modify_code(pc, old, new, true);
}
void arch_ftrace_update_code(int command)
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 57b30bcf9f21..f6b00743c399 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -244,6 +244,11 @@ static void mte_update_gcr_excl(struct task_struct *task)
SYS_GCR_EL1);
}
+#ifdef CONFIG_KASAN_HW_TAGS
+/* Only called from assembly, silence sparse */
+void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr,
+ __le32 *updptr, int nr_inst);
+
void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr,
__le32 *updptr, int nr_inst)
{
@@ -252,6 +257,7 @@ void __init kasan_hw_tags_enable(struct alt_instr *alt, __le32 *origptr,
if (kasan_hw_tags_enabled())
*updptr = cpu_to_le32(aarch64_insn_gen_nop());
}
+#endif
void mte_thread_init_user(void)
{
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index cf3a759f10d4..fea3223704b6 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -303,14 +303,13 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
early_fixmap_init();
early_ioremap_init();
+ setup_machine_fdt(__fdt_pointer);
+
/*
* Initialise the static keys early as they may be enabled by the
- * cpufeature code, early parameters, and DT setup.
+ * cpufeature code and early parameters.
*/
jump_label_init();
-
- setup_machine_fdt(__fdt_pointer);
-
parse_early_param();
/*
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 4e39ace073af..3b8d062e30ea 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1230,6 +1230,9 @@ bool kvm_arch_timer_get_input_level(int vintid)
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
struct arch_timer_context *timer;
+ if (WARN(!vcpu, "No vcpu context!\n"))
+ return false;
+
if (vintid == vcpu_vtimer(vcpu)->irq.irq)
timer = vcpu_vtimer(vcpu);
else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 400bb0fe2745..a0188144a122 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -150,8 +150,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out_free_stage2_pgd;
- if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
+ ret = -ENOMEM;
goto out_free_stage2_pgd;
+ }
cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);
kvm_vgic_early_init(kvm);
@@ -2271,7 +2273,11 @@ static int __init early_kvm_mode_cfg(char *arg)
return -EINVAL;
if (strcmp(arg, "protected") == 0) {
- kvm_mode = KVM_MODE_PROTECTED;
+ if (!is_kernel_in_hyp_mode())
+ kvm_mode = KVM_MODE_PROTECTED;
+ else
+ pr_warn_once("Protected KVM not available with VHE\n");
+
return 0;
}
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 3d251a4d2cf7..6012b08ecb14 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -80,6 +80,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
vcpu->arch.flags &= ~KVM_ARM64_FP_ENABLED;
vcpu->arch.flags |= KVM_ARM64_FP_HOST;
+ vcpu->arch.flags &= ~KVM_ARM64_HOST_SVE_ENABLED;
if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN)
vcpu->arch.flags |= KVM_ARM64_HOST_SVE_ENABLED;
@@ -93,6 +94,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
* operations. Do this for ZA as well for now for simplicity.
*/
if (system_supports_sme()) {
+ vcpu->arch.flags &= ~KVM_ARM64_HOST_SME_ENABLED;
if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN)
vcpu->arch.flags |= KVM_ARM64_HOST_SME_ENABLED;
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 78edf077fa3b..1e78acf9662e 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -314,15 +314,11 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
enum kvm_pgtable_prot prot)
{
- hyp_assert_lock_held(&host_kvm.lock);
-
return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
}
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
{
- hyp_assert_lock_held(&host_kvm.lock);
-
return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt,
addr, size, &host_s2_pool, owner_id);
}
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index b6d86e423319..35a4331ba5f3 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -243,15 +243,9 @@ u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id)
case SYS_ID_AA64MMFR2_EL1:
return get_pvm_id_aa64mmfr2(vcpu);
default:
- /*
- * Should never happen because all cases are covered in
- * pvm_sys_reg_descs[].
- */
- WARN_ON(1);
- break;
+ /* Unhandled ID register, RAZ */
+ return 0;
}
-
- return 0;
}
static u64 read_id_reg(const struct kvm_vcpu *vcpu,
@@ -332,6 +326,16 @@ static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu,
/* Mark the specified system register as an AArch64 feature id register. */
#define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 }
+/*
+ * sys_reg_desc initialiser for architecturally unallocated cpufeature ID
+ * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2
+ * (1 <= crm < 8, 0 <= Op2 < 8).
+ */
+#define ID_UNALLOCATED(crm, op2) { \
+ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \
+ .access = pvm_access_id_aarch64, \
+}
+
/* Mark the specified system register as Read-As-Zero/Write-Ignored */
#define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi }
@@ -375,24 +379,46 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
AARCH32(SYS_MVFR0_EL1),
AARCH32(SYS_MVFR1_EL1),
AARCH32(SYS_MVFR2_EL1),
+ ID_UNALLOCATED(3,3),
AARCH32(SYS_ID_PFR2_EL1),
AARCH32(SYS_ID_DFR1_EL1),
AARCH32(SYS_ID_MMFR5_EL1),
+ ID_UNALLOCATED(3,7),
/* AArch64 ID registers */
/* CRm=4 */
AARCH64(SYS_ID_AA64PFR0_EL1),
AARCH64(SYS_ID_AA64PFR1_EL1),
+ ID_UNALLOCATED(4,2),
+ ID_UNALLOCATED(4,3),
AARCH64(SYS_ID_AA64ZFR0_EL1),
+ ID_UNALLOCATED(4,5),
+ ID_UNALLOCATED(4,6),
+ ID_UNALLOCATED(4,7),
AARCH64(SYS_ID_AA64DFR0_EL1),
AARCH64(SYS_ID_AA64DFR1_EL1),
+ ID_UNALLOCATED(5,2),
+ ID_UNALLOCATED(5,3),
AARCH64(SYS_ID_AA64AFR0_EL1),
AARCH64(SYS_ID_AA64AFR1_EL1),
+ ID_UNALLOCATED(5,6),
+ ID_UNALLOCATED(5,7),
AARCH64(SYS_ID_AA64ISAR0_EL1),
AARCH64(SYS_ID_AA64ISAR1_EL1),
+ AARCH64(SYS_ID_AA64ISAR2_EL1),
+ ID_UNALLOCATED(6,3),
+ ID_UNALLOCATED(6,4),
+ ID_UNALLOCATED(6,5),
+ ID_UNALLOCATED(6,6),
+ ID_UNALLOCATED(6,7),
AARCH64(SYS_ID_AA64MMFR0_EL1),
AARCH64(SYS_ID_AA64MMFR1_EL1),
AARCH64(SYS_ID_AA64MMFR2_EL1),
+ ID_UNALLOCATED(7,3),
+ ID_UNALLOCATED(7,4),
+ ID_UNALLOCATED(7,5),
+ ID_UNALLOCATED(7,6),
+ ID_UNALLOCATED(7,7),
/* Scalable Vector Registers are restricted. */
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index 77a67e9d3d14..e070cda86e12 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -429,11 +429,11 @@ static const struct vgic_register_region vgic_v2_dist_registers[] = {
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_SET,
vgic_mmio_read_pending, vgic_mmio_write_spending,
- NULL, vgic_uaccess_write_spending, 1,
+ vgic_uaccess_read_pending, vgic_uaccess_write_spending, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PENDING_CLEAR,
vgic_mmio_read_pending, vgic_mmio_write_cpending,
- NULL, vgic_uaccess_write_cpending, 1,
+ vgic_uaccess_read_pending, vgic_uaccess_write_cpending, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
vgic_mmio_read_active, vgic_mmio_write_sactive,
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
index f7aa7bcd6fb8..f15e29cc63ce 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c
@@ -353,42 +353,6 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
return 0;
}
-static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu,
- gpa_t addr, unsigned int len)
-{
- u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
- u32 value = 0;
- int i;
-
- /*
- * pending state of interrupt is latched in pending_latch variable.
- * Userspace will save and restore pending state and line_level
- * separately.
- * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
- * for handling of ISPENDR and ICPENDR.
- */
- for (i = 0; i < len * 8; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- bool state = irq->pending_latch;
-
- if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
- int err;
-
- err = irq_get_irqchip_state(irq->host_irq,
- IRQCHIP_STATE_PENDING,
- &state);
- WARN_ON(err);
- }
-
- if (state)
- value |= (1U << i);
-
- vgic_put_irq(vcpu->kvm, irq);
- }
-
- return value;
-}
-
static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val)
@@ -666,7 +630,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
vgic_mmio_read_pending, vgic_mmio_write_spending,
- vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
+ vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
vgic_mmio_read_pending, vgic_mmio_write_cpending,
@@ -750,7 +714,7 @@ static const struct vgic_register_region vgic_v3_rd_registers[] = {
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
vgic_mmio_read_pending, vgic_mmio_write_spending,
- vgic_v3_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
+ vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
vgic_mmio_read_pending, vgic_mmio_write_cpending,
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index 49837d3a3ef5..997d0fce2088 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -226,8 +226,9 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
return 0;
}
-unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
- gpa_t addr, unsigned int len)
+static unsigned long __read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ bool is_user)
{
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
u32 value = 0;
@@ -239,6 +240,15 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
unsigned long flags;
bool val;
+ /*
+ * When used from userspace with a GICv3 model:
+ *
+ * Pending state of interrupt is latched in pending_latch
+ * variable. Userspace will save and restore pending state
+ * and line_level separately.
+ * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
+ * for handling of ISPENDR and ICPENDR.
+ */
raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
int err;
@@ -248,10 +258,20 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
IRQCHIP_STATE_PENDING,
&val);
WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
- } else if (vgic_irq_is_mapped_level(irq)) {
+ } else if (!is_user && vgic_irq_is_mapped_level(irq)) {
val = vgic_get_phys_line_level(irq);
} else {
- val = irq_is_pending(irq);
+ switch (vcpu->kvm->arch.vgic.vgic_model) {
+ case KVM_DEV_TYPE_ARM_VGIC_V3:
+ if (is_user) {
+ val = irq->pending_latch;
+ break;
+ }
+ fallthrough;
+ default:
+ val = irq_is_pending(irq);
+ break;
+ }
}
value |= ((u32)val << i);
@@ -263,6 +283,18 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
return value;
}
+unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ return __read_pending(vcpu, addr, len, false);
+}
+
+unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ return __read_pending(vcpu, addr, len, true);
+}
+
static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
return (vgic_irq_is_sgi(irq->intid) &&
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h
index 3fa696f198a3..6082d4b66d39 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.h
+++ b/arch/arm64/kvm/vgic/vgic-mmio.h
@@ -149,6 +149,9 @@ int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len);
+unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val);
diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c
index 8d5f0506fd87..d78ae63d7c15 100644
--- a/arch/arm64/kvm/vmid.c
+++ b/arch/arm64/kvm/vmid.c
@@ -66,7 +66,7 @@ static void flush_context(void)
* the next context-switch, we broadcast TLB flush + I-cache
* invalidation over the inner shareable domain on rollover.
*/
- kvm_call_hyp(__kvm_flush_vm_context);
+ kvm_call_hyp(__kvm_flush_vm_context);
}
static bool check_update_reserved_vmid(u64 vmid, u64 newvmid)
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 0ea6cc25dc66..21c907987080 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -218,8 +218,6 @@ SYM_FUNC_ALIAS(__dma_flush_area, __pi___dma_flush_area)
*/
SYM_FUNC_START(__pi___dma_map_area)
add x1, x0, x1
- cmp w2, #DMA_FROM_DEVICE
- b.eq __pi_dcache_inval_poc
b __pi_dcache_clean_poc
SYM_FUNC_END(__pi___dma_map_area)
SYM_FUNC_ALIAS(__dma_map_area, __pi___dma_map_area)
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 6719f9efea09..6099c81b9322 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -9,9 +9,9 @@
#include <linux/dma-map-ops.h>
#include <linux/dma-iommu.h>
#include <xen/xen.h>
-#include <xen/swiotlb-xen.h>
#include <asm/cacheflush.h>
+#include <asm/xen/xen-ops.h>
void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
@@ -52,8 +52,5 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
if (iommu)
iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1);
-#ifdef CONFIG_XEN
- if (xen_swiotlb_detect())
- dev->dma_ops = &xen_swiotlb_dma_ops;
-#endif
+ xen_setup_dma_ops(dev);
}
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 8ab4035dea27..42f2e9a8616c 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1478,6 +1478,7 @@ skip_init_ctx:
bpf_jit_binary_free(header);
prog->bpf_func = NULL;
prog->jited = 0;
+ prog->jited_len = 0;
goto out_off;
}
bpf_jit_binary_lock_ro(header);
diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk
index 89bfb74e28de..5c55509eb43f 100755
--- a/arch/arm64/tools/gen-sysreg.awk
+++ b/arch/arm64/tools/gen-sysreg.awk
@@ -253,7 +253,7 @@ END {
next
}
-/0b[01]+/ && block = "Enum" {
+/0b[01]+/ && block == "Enum" {
expect_fields(2)
val = $1
name = $2
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 80657bf83b05..1920d52653b4 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -343,6 +343,7 @@ config NR_CPUS
config NUMA
bool "NUMA Support"
+ select SMP
select ACPI_NUMA if ACPI
help
Say Y to compile the kernel with NUMA (Non-Uniform Memory Access)
diff --git a/arch/loongarch/include/asm/hardirq.h b/arch/loongarch/include/asm/hardirq.h
index befe8184aa08..0ef3b18f8980 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -19,7 +19,7 @@ typedef struct {
unsigned int __softirq_pending;
} ____cacheline_aligned irq_cpustat_t;
-DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
+DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
#define __ARCH_IRQ_STAT
diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h
index 34f15a6fb1e7..e6569f18c6dd 100644
--- a/arch/loongarch/include/asm/percpu.h
+++ b/arch/loongarch/include/asm/percpu.h
@@ -6,6 +6,7 @@
#define __ASM_PERCPU_H
#include <asm/cmpxchg.h>
+#include <asm/loongarch.h>
/* Use r21 for fast access */
register unsigned long __my_cpu_offset __asm__("$r21");
diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h
index 551e1f37c705..71189b28bfb2 100644
--- a/arch/loongarch/include/asm/smp.h
+++ b/arch/loongarch/include/asm/smp.h
@@ -9,10 +9,16 @@
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/linkage.h>
-#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
+extern int smp_num_siblings;
+extern int num_processors;
+extern int disabled_cpus;
+extern cpumask_t cpu_sibling_map[];
+extern cpumask_t cpu_core_map[];
+extern cpumask_t cpu_foreign_map[];
+
void loongson3_smp_setup(void);
void loongson3_prepare_cpus(unsigned int max_cpus);
void loongson3_boot_secondary(int cpu, struct task_struct *idle);
@@ -25,26 +31,11 @@ int loongson3_cpu_disable(void);
void loongson3_cpu_die(unsigned int cpu);
#endif
-#ifdef CONFIG_SMP
-
static inline void plat_smp_setup(void)
{
loongson3_smp_setup();
}
-#else /* !CONFIG_SMP */
-
-static inline void plat_smp_setup(void) { }
-
-#endif /* !CONFIG_SMP */
-
-extern int smp_num_siblings;
-extern int num_processors;
-extern int disabled_cpus;
-extern cpumask_t cpu_sibling_map[];
-extern cpumask_t cpu_core_map[];
-extern cpumask_t cpu_foreign_map[];
-
static inline int raw_smp_processor_id(void)
{
#if defined(__VDSO__)
diff --git a/arch/loongarch/include/asm/timex.h b/arch/loongarch/include/asm/timex.h
index d3ed99a4fdbd..fb41e9e7a222 100644
--- a/arch/loongarch/include/asm/timex.h
+++ b/arch/loongarch/include/asm/timex.h
@@ -12,13 +12,6 @@
#include <asm/cpu.h>
#include <asm/cpu-features.h>
-/*
- * Standard way to access the cycle counter.
- * Currently only used on SMP for scheduling.
- *
- * We know that all SMP capable CPUs have cycle counters.
- */
-
typedef unsigned long cycles_t;
#define get_cycles get_cycles
diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c
index b16c3dea5eeb..bb729ee8a237 100644
--- a/arch/loongarch/kernel/acpi.c
+++ b/arch/loongarch/kernel/acpi.c
@@ -138,6 +138,7 @@ void __init acpi_boot_table_init(void)
}
}
+#ifdef CONFIG_SMP
static int set_processor_mask(u32 id, u32 flags)
{
@@ -166,15 +167,18 @@ static int set_processor_mask(u32 id, u32 flags)
return cpu;
}
+#endif
static void __init acpi_process_madt(void)
{
+#ifdef CONFIG_SMP
int i;
for (i = 0; i < NR_CPUS; i++) {
__cpu_number_map[i] = -1;
__cpu_logical_map[i] = -1;
}
+#endif
loongson_sysconf.nr_cpus = num_processors;
}
diff --git a/arch/loongarch/kernel/cacheinfo.c b/arch/loongarch/kernel/cacheinfo.c
index 8c9fe29e98f0..b38f5489d094 100644
--- a/arch/loongarch/kernel/cacheinfo.c
+++ b/arch/loongarch/kernel/cacheinfo.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
+#include <asm/cpu-info.h>
#include <linux/cacheinfo.h>
/* Populates leaf and increments to next leaf */
diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c
index 4b671d305ede..b34b8d792aa4 100644
--- a/arch/loongarch/kernel/irq.c
+++ b/arch/loongarch/kernel/irq.c
@@ -22,6 +22,8 @@
#include <asm/setup.h>
DEFINE_PER_CPU(unsigned long, irq_stack);
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
struct irq_domain *cpu_domain;
struct irq_domain *liointc_domain;
@@ -56,8 +58,11 @@ int arch_show_interrupts(struct seq_file *p, int prec)
void __init init_IRQ(void)
{
- int i, r, ipi_irq;
+ int i;
+#ifdef CONFIG_SMP
+ int r, ipi_irq;
static int ipi_dummy_dev;
+#endif
unsigned int order = get_order(IRQ_STACK_SIZE);
struct page *page;
diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c
index 6d944d65f600..bfa0dfe8b7d7 100644
--- a/arch/loongarch/kernel/process.c
+++ b/arch/loongarch/kernel/process.c
@@ -120,10 +120,12 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
/*
* Copy architecture-specific thread state
*/
-int copy_thread(unsigned long clone_flags, unsigned long usp,
- unsigned long kthread_arg, struct task_struct *p, unsigned long tls)
+int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
unsigned long childksp;
+ unsigned long tls = args->tls;
+ unsigned long usp = args->stack;
+ unsigned long clone_flags = args->flags;
struct pt_regs *childregs, *regs = current_pt_regs();
childksp = (unsigned long)task_stack_page(p) + THREAD_SIZE - 32;
@@ -136,12 +138,12 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
p->thread.csr_crmd = csr_read32(LOONGARCH_CSR_CRMD);
p->thread.csr_prmd = csr_read32(LOONGARCH_CSR_PRMD);
p->thread.csr_ecfg = csr_read32(LOONGARCH_CSR_ECFG);
- if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+ if (unlikely(args->fn)) {
/* kernel thread */
- p->thread.reg23 = usp; /* fn */
- p->thread.reg24 = kthread_arg;
p->thread.reg03 = childksp;
- p->thread.reg01 = (unsigned long) ret_from_kernel_thread;
+ p->thread.reg23 = (unsigned long)args->fn;
+ p->thread.reg24 = (unsigned long)args->fn_arg;
+ p->thread.reg01 = (unsigned long)ret_from_kernel_thread;
memset(childregs, 0, sizeof(struct pt_regs));
childregs->csr_euen = p->thread.csr_euen;
childregs->csr_crmd = p->thread.csr_crmd;
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 185e4035811a..c74860b53375 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -39,7 +39,6 @@
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/setup.h>
-#include <asm/smp.h>
#include <asm/time.h>
#define SMBIOS_BIOSSIZE_OFFSET 0x09
@@ -349,8 +348,6 @@ static void __init prefill_possible_map(void)
nr_cpu_ids = possible;
}
-#else
-static inline void prefill_possible_map(void) {}
#endif
void __init setup_arch(char **cmdline_p)
@@ -367,8 +364,10 @@ void __init setup_arch(char **cmdline_p)
arch_mem_init(cmdline_p);
resource_init();
+#ifdef CONFIG_SMP
plat_smp_setup();
prefill_possible_map();
+#endif
paging_init();
}
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index b8c53b755a25..73cec62504fb 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -66,8 +66,6 @@ static cpumask_t cpu_core_setup_map;
struct secondary_data cpuboot_data;
static DEFINE_PER_CPU(int, cpu_state);
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
enum ipi_msg_type {
IPI_RESCHEDULE,
diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S
index 9d508158fe1a..78311a6101a3 100644
--- a/arch/loongarch/kernel/vmlinux.lds.S
+++ b/arch/loongarch/kernel/vmlinux.lds.S
@@ -101,6 +101,7 @@ SECTIONS
STABS_DEBUG
DWARF_DEBUG
+ ELF_DETAILS
.gptab.sdata : {
*(.gptab.data)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index be68c1f02b79..c2ce2e60c8f0 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -223,7 +223,6 @@ config PPC
select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
select HAVE_IOREMAP_PROT
- select HAVE_IRQ_EXIT_ON_IRQ_STACK
select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE
@@ -786,7 +785,6 @@ config THREAD_SHIFT
range 13 15
default "15" if PPC_256K_PAGES
default "14" if PPC64
- default "14" if KASAN
default "13"
help
Used to define the stack size. The default is almost always what you
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 125328d1b980..af58f1ed3952 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -14,10 +14,16 @@
#ifdef __KERNEL__
-#if defined(CONFIG_VMAP_STACK) && CONFIG_THREAD_SHIFT < PAGE_SHIFT
+#ifdef CONFIG_KASAN
+#define MIN_THREAD_SHIFT (CONFIG_THREAD_SHIFT + 1)
+#else
+#define MIN_THREAD_SHIFT CONFIG_THREAD_SHIFT
+#endif
+
+#if defined(CONFIG_VMAP_STACK) && MIN_THREAD_SHIFT < PAGE_SHIFT
#define THREAD_SHIFT PAGE_SHIFT
#else
-#define THREAD_SHIFT CONFIG_THREAD_SHIFT
+#define THREAD_SHIFT MIN_THREAD_SHIFT
#endif
#define THREAD_SIZE (1 << THREAD_SHIFT)
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 2e2a2a9bcf43..f91f0f29a566 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -37,6 +37,8 @@ KASAN_SANITIZE_paca.o := n
KASAN_SANITIZE_setup_64.o := n
KASAN_SANITIZE_mce.o := n
KASAN_SANITIZE_mce_power.o := n
+KASAN_SANITIZE_udbg.o := n
+KASAN_SANITIZE_udbg_16550.o := n
# we have to be particularly careful in ppc64 to exclude code that
# runs with translations off, as we cannot access the shadow with
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index b62046bf3bb8..ee0433809621 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -2158,12 +2158,12 @@ static unsigned long ___get_wchan(struct task_struct *p)
return 0;
do {
- sp = *(unsigned long *)sp;
+ sp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) ||
task_is_running(p))
return 0;
if (count > 0) {
- ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE];
+ ip = READ_ONCE_NOCHECK(((unsigned long *)sp)[STACK_FRAME_LR_SAVE]);
if (!in_sched_functions(ip))
return ip;
}
diff --git a/arch/powerpc/kernel/ptrace/ptrace-fpu.c b/arch/powerpc/kernel/ptrace/ptrace-fpu.c
index 5dca19361316..09c49632bfe5 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-fpu.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-fpu.c
@@ -17,9 +17,13 @@ int ptrace_get_fpr(struct task_struct *child, int index, unsigned long *data)
#ifdef CONFIG_PPC_FPU_REGS
flush_fp_to_thread(child);
- if (fpidx < (PT_FPSCR - PT_FPR0))
- memcpy(data, &child->thread.TS_FPR(fpidx), sizeof(long));
- else
+ if (fpidx < (PT_FPSCR - PT_FPR0)) {
+ if (IS_ENABLED(CONFIG_PPC32))
+ // On 32-bit the index we are passed refers to 32-bit words
+ *data = ((u32 *)child->thread.fp_state.fpr)[fpidx];
+ else
+ memcpy(data, &child->thread.TS_FPR(fpidx), sizeof(long));
+ } else
*data = child->thread.fp_state.fpscr;
#else
*data = 0;
@@ -39,9 +43,13 @@ int ptrace_put_fpr(struct task_struct *child, int index, unsigned long data)
#ifdef CONFIG_PPC_FPU_REGS
flush_fp_to_thread(child);
- if (fpidx < (PT_FPSCR - PT_FPR0))
- memcpy(&child->thread.TS_FPR(fpidx), &data, sizeof(long));
- else
+ if (fpidx < (PT_FPSCR - PT_FPR0)) {
+ if (IS_ENABLED(CONFIG_PPC32))
+ // On 32-bit the index we are passed refers to 32-bit words
+ ((u32 *)child->thread.fp_state.fpr)[fpidx] = data;
+ else
+ memcpy(&child->thread.TS_FPR(fpidx), &data, sizeof(long));
+ } else
child->thread.fp_state.fpscr = data;
#endif
diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c
index 4d2dc22d4a2d..5d7a72b41ae7 100644
--- a/arch/powerpc/kernel/ptrace/ptrace.c
+++ b/arch/powerpc/kernel/ptrace/ptrace.c
@@ -444,4 +444,7 @@ void __init pt_regs_check(void)
* real registers.
*/
BUILD_BUG_ON(PT_DSCR < sizeof(struct user_pt_regs) / sizeof(unsigned long));
+
+ // ptrace_get/put_fpr() rely on PPC32 and VSX being incompatible
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_PPC32) && IS_ENABLED(CONFIG_VSX));
}
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 9bb43aa53d43..a6fce3106e02 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -993,8 +993,8 @@ int rtas_call_reentrant(int token, int nargs, int nret, int *outputs, ...)
*
* Return: A pointer to the specified errorlog or NULL if not found.
*/
-struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
- uint16_t section_id)
+noinstr struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
+ uint16_t section_id)
{
struct rtas_ext_event_log_v6 *ext_log =
(struct rtas_ext_event_log_v6 *)log->buffer;
diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
index d85fa9fc6f3c..80f54723cf6d 100644
--- a/arch/powerpc/kexec/crash.c
+++ b/arch/powerpc/kexec/crash.c
@@ -224,7 +224,7 @@ void crash_kexec_secondary(struct pt_regs *regs)
/* wait for all the CPUs to hit real mode but timeout if they don't come in */
#if defined(CONFIG_SMP) && defined(CONFIG_PPC64)
-static void __maybe_unused crash_kexec_wait_realmode(int cpu)
+noinstr static void __maybe_unused crash_kexec_wait_realmode(int cpu)
{
unsigned int msecs;
int i;
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c
index 1f3f9fedf1bc..0d04f9d5da8d 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -19,7 +19,6 @@
#include <asm/cacheflush.h>
#include <asm/kdump.h>
#include <mm/mmu_decl.h>
-#include <generated/compile.h>
#include <generated/utsrelease.h>
struct regions {
@@ -37,10 +36,6 @@ struct regions {
int reserved_mem_size_cells;
};
-/* Simplified build-specific string for starting entropy. */
-static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
- LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
-
struct regions __initdata regions;
static __init void kaslr_get_cmdline(void *fdt)
@@ -71,7 +66,8 @@ static unsigned long __init get_boot_seed(void *fdt)
{
unsigned long hash = 0;
- hash = rotate_xor(hash, build_str, sizeof(build_str));
+ /* build-specific string for starting entropy. */
+ hash = rotate_xor(hash, linux_banner, strlen(linux_banner));
hash = rotate_xor(hash, fdt, fdt_totalsize(fdt));
return hash;
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 6488b3842199..19f0fc5c6f1b 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -4,6 +4,7 @@
# in particular, idle code runs a bunch of things in real mode
KASAN_SANITIZE_idle.o := n
KASAN_SANITIZE_pci-ioda.o := n
+KASAN_SANITIZE_pci-ioda-tce.o := n
# pnv_machine_check_early
KASAN_SANITIZE_setup.o := n
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 181b855b3050..82cae08976bc 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -465,6 +465,9 @@ static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu
u32 available_events;
int index, rc = 0;
+ if (!p->stat_buffer_len)
+ return -ENOENT;
+
available_events = (p->stat_buffer_len - sizeof(struct papr_scm_perf_stats))
/ sizeof(struct papr_scm_perf_stat);
if (available_events == 0)
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index c22f58155948..32ffef9f6e5b 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -364,8 +364,13 @@ config RISCV_ISA_SVPBMT
select RISCV_ALTERNATIVE
default y
help
- Adds support to dynamically detect the presence of the SVPBMT extension
- (Supervisor-mode: page-based memory types) and enable its usage.
+ Adds support to dynamically detect the presence of the SVPBMT
+ ISA-extension (Supervisor-mode: page-based memory types) and
+ enable its usage.
+
+ The memory type for a page contains a combination of attributes
+ that indicate the cacheability, idempotency, and ordering
+ properties for access to that page.
The SVPBMT extension is only available on 64Bit cpus.
diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas
index ebfcd5cc6eaf..457ac72c9b36 100644
--- a/arch/riscv/Kconfig.erratas
+++ b/arch/riscv/Kconfig.erratas
@@ -35,6 +35,7 @@ config ERRATA_SIFIVE_CIP_1200
config ERRATA_THEAD
bool "T-HEAD errata"
+ depends on !XIP_KERNEL
select RISCV_ALTERNATIVE
help
All T-HEAD errata Kconfig depend on this Kconfig. Disabling
diff --git a/arch/riscv/boot/dts/microchip/mpfs.dtsi b/arch/riscv/boot/dts/microchip/mpfs.dtsi
index 8c3259134194..3095d08453a1 100644
--- a/arch/riscv/boot/dts/microchip/mpfs.dtsi
+++ b/arch/riscv/boot/dts/microchip/mpfs.dtsi
@@ -192,6 +192,15 @@
riscv,ndev = <186>;
};
+ pdma: dma-controller@3000000 {
+ compatible = "sifive,fu540-c000-pdma", "sifive,pdma0";
+ reg = <0x0 0x3000000 0x0 0x8000>;
+ interrupt-parent = <&plic>;
+ interrupts = <5 6>, <7 8>, <9 10>, <11 12>;
+ dma-channels = <4>;
+ #dma-cells = <1>;
+ };
+
clkcfg: clkcfg@20002000 {
compatible = "microchip,mpfs-clkcfg";
reg = <0x0 0x20002000 0x0 0x1000>, <0x0 0x3E001000 0x0 0x1000>;
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index a6f62a6d1edd..12b05ce164bb 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -293,7 +293,6 @@ void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin,
unsigned int stage)
{
u32 cpu_req_feature = cpufeature_probe(stage);
- u32 cpu_apply_feature = 0;
struct alt_entry *alt;
u32 tmp;
@@ -307,10 +306,8 @@ void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin,
}
tmp = (1U << alt->errata_id);
- if (cpu_req_feature & tmp) {
+ if (cpu_req_feature & tmp)
patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len);
- cpu_apply_feature |= tmp;
- }
}
}
#endif
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
index 9f764df125db..6cd93995fb65 100644
--- a/arch/riscv/kvm/vmid.c
+++ b/arch/riscv/kvm/vmid.c
@@ -97,7 +97,7 @@ void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu)
* We ran out of VMIDs so we increment vmid_version and
* start assigning VMIDs from 1.
*
- * This also means existing VMIDs assignement to all Guest
+ * This also means existing VMIDs assignment to all Guest
* instances is invalid and we have force VMID re-assignement
* for all Guest instances. The Guest instances that were not
* running will automatically pick-up new VMIDs because will
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b1a88f6cc349..91c0b80a8bf0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -125,6 +125,7 @@ config S390
select CLONE_BACKWARDS2
select DMA_OPS if PCI
select DYNAMIC_FTRACE if FUNCTION_TRACER
+ select GCC12_NO_ARRAY_BOUNDS
select GENERIC_ALLOCATOR
select GENERIC_CPU_AUTOPROBE
select GENERIC_CPU_VULNERABILITIES
@@ -768,7 +769,6 @@ menu "Virtualization"
config PROTECTED_VIRTUALIZATION_GUEST
def_bool n
prompt "Protected virtualization guest support"
- select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
help
Select this option, if you want to be able to run this
kernel as a protected virtualization KVM guest.
diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index d73611b35164..495c68a4df1e 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -32,15 +32,7 @@ KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector
KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member)
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g)
KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,))
-
-ifdef CONFIG_CC_IS_GCC
- ifeq ($(call cc-ifversion, -ge, 1200, y), y)
- ifeq ($(call cc-ifversion, -lt, 1300, y), y)
- KBUILD_CFLAGS += $(call cc-disable-warning, array-bounds)
- KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, array-bounds)
- endif
- endif
-endif
+KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_CC_NO_ARRAY_BOUNDS),-Wno-array-bounds)
UTS_MACHINE := s390x
STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384)
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 6fb6bf64326f..6a0ac00d5a42 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -31,6 +31,7 @@
#include <linux/cma.h>
#include <linux/gfp.h>
#include <linux/dma-direct.h>
+#include <linux/platform-feature.h>
#include <asm/processor.h>
#include <linux/uaccess.h>
#include <asm/pgalloc.h>
@@ -168,22 +169,14 @@ bool force_dma_unencrypted(struct device *dev)
return is_prot_virt_guest();
}
-#ifdef CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
-
-int arch_has_restricted_virtio_memory_access(void)
-{
- return is_prot_virt_guest();
-}
-EXPORT_SYMBOL(arch_has_restricted_virtio_memory_access);
-
-#endif
-
/* protected virtualization */
static void pv_init(void)
{
if (!is_prot_virt_guest())
return;
+ platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS);
+
/* make sure bounce buffers are shared */
swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE);
swiotlb_update_mem_attributes();
diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c
index 5c092a9153ea..027847023184 100644
--- a/arch/um/drivers/virt-pci.c
+++ b/arch/um/drivers/virt-pci.c
@@ -544,6 +544,8 @@ static int um_pci_init_vqs(struct um_pci_device *dev)
dev->cmd_vq = vqs[0];
dev->irq_vq = vqs[1];
+ virtio_device_ready(dev->vdev);
+
for (i = 0; i < NUM_IRQ_MSGS; i++) {
void *msg = kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL);
@@ -587,7 +589,7 @@ static int um_pci_virtio_probe(struct virtio_device *vdev)
dev->irq = irq_alloc_desc(numa_node_id());
if (dev->irq < 0) {
err = dev->irq;
- goto error;
+ goto err_reset;
}
um_pci_devices[free].dev = dev;
vdev->priv = dev;
@@ -604,6 +606,9 @@ static int um_pci_virtio_probe(struct virtio_device *vdev)
um_pci_rescan();
return 0;
+err_reset:
+ virtio_reset_device(vdev);
+ vdev->config->del_vqs(vdev);
error:
mutex_unlock(&um_pci_mtx);
kfree(dev);
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9783ebc4e021..be0b95e51df6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1542,7 +1542,6 @@ config X86_CPA_STATISTICS
config X86_MEM_ENCRYPT
select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select DYNAMIC_PHYSICAL_MASK
- select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
def_bool n
config AMD_MEM_ENCRYPT
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 03deb4d6920d..928dcf7a20d9 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -124,6 +124,51 @@ static u64 get_cc_mask(void)
return BIT_ULL(gpa_width - 1);
}
+/*
+ * The TDX module spec states that #VE may be injected for a limited set of
+ * reasons:
+ *
+ * - Emulation of the architectural #VE injection on EPT violation;
+ *
+ * - As a result of guest TD execution of a disallowed instruction,
+ * a disallowed MSR access, or CPUID virtualization;
+ *
+ * - A notification to the guest TD about anomalous behavior;
+ *
+ * The last one is opt-in and is not used by the kernel.
+ *
+ * The Intel Software Developer's Manual describes cases when instruction
+ * length field can be used in section "Information for VM Exits Due to
+ * Instruction Execution".
+ *
+ * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
+ * information if #VE occurred due to instruction execution, but not for EPT
+ * violations.
+ */
+static int ve_instr_len(struct ve_info *ve)
+{
+ switch (ve->exit_reason) {
+ case EXIT_REASON_HLT:
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ case EXIT_REASON_CPUID:
+ case EXIT_REASON_IO_INSTRUCTION:
+ /* It is safe to use ve->instr_len for #VE due instructions */
+ return ve->instr_len;
+ case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * For EPT violations, ve->insn_len is not defined. For those,
+ * the kernel must decode instructions manually and should not
+ * be using this function.
+ */
+ WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
+ return 0;
+ default:
+ WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
+ return ve->instr_len;
+ }
+}
+
static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
{
struct tdx_hypercall_args args = {
@@ -147,7 +192,7 @@ static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
}
-static bool handle_halt(void)
+static int handle_halt(struct ve_info *ve)
{
/*
* Since non safe halt is mainly used in CPU offlining
@@ -158,9 +203,9 @@ static bool handle_halt(void)
const bool do_sti = false;
if (__halt(irq_disabled, do_sti))
- return false;
+ return -EIO;
- return true;
+ return ve_instr_len(ve);
}
void __cpuidle tdx_safe_halt(void)
@@ -180,7 +225,7 @@ void __cpuidle tdx_safe_halt(void)
WARN_ONCE(1, "HLT instruction emulation failed\n");
}
-static bool read_msr(struct pt_regs *regs)
+static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
@@ -194,14 +239,14 @@ static bool read_msr(struct pt_regs *regs)
* (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
*/
if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
- return false;
+ return -EIO;
regs->ax = lower_32_bits(args.r11);
regs->dx = upper_32_bits(args.r11);
- return true;
+ return ve_instr_len(ve);
}
-static bool write_msr(struct pt_regs *regs)
+static int write_msr(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
@@ -215,10 +260,13 @@ static bool write_msr(struct pt_regs *regs)
* can be found in TDX Guest-Host-Communication Interface
* (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
*/
- return !__tdx_hypercall(&args, 0);
+ if (__tdx_hypercall(&args, 0))
+ return -EIO;
+
+ return ve_instr_len(ve);
}
-static bool handle_cpuid(struct pt_regs *regs)
+static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
@@ -236,7 +284,7 @@ static bool handle_cpuid(struct pt_regs *regs)
*/
if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
regs->ax = regs->bx = regs->cx = regs->dx = 0;
- return true;
+ return ve_instr_len(ve);
}
/*
@@ -245,7 +293,7 @@ static bool handle_cpuid(struct pt_regs *regs)
* (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
*/
if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
- return false;
+ return -EIO;
/*
* As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
@@ -257,7 +305,7 @@ static bool handle_cpuid(struct pt_regs *regs)
regs->cx = args.r14;
regs->dx = args.r15;
- return true;
+ return ve_instr_len(ve);
}
static bool mmio_read(int size, unsigned long addr, unsigned long *val)
@@ -283,10 +331,10 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val)
EPT_WRITE, addr, val);
}
-static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
+static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
{
+ unsigned long *reg, val, vaddr;
char buffer[MAX_INSN_SIZE];
- unsigned long *reg, val;
struct insn insn = {};
enum mmio_type mmio;
int size, extend_size;
@@ -294,34 +342,49 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
/* Only in-kernel MMIO is supported */
if (WARN_ON_ONCE(user_mode(regs)))
- return false;
+ return -EFAULT;
if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
- return false;
+ return -EFAULT;
if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
- return false;
+ return -EINVAL;
mmio = insn_decode_mmio(&insn, &size);
if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED))
- return false;
+ return -EINVAL;
if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
reg = insn_get_modrm_reg_ptr(&insn, regs);
if (!reg)
- return false;
+ return -EINVAL;
}
- ve->instr_len = insn.length;
+ /*
+ * Reject EPT violation #VEs that split pages.
+ *
+ * MMIO accesses are supposed to be naturally aligned and therefore
+ * never cross page boundaries. Seeing split page accesses indicates
+ * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
+ *
+ * load_unaligned_zeropad() will recover using exception fixups.
+ */
+ vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
+ return -EFAULT;
/* Handle writes first */
switch (mmio) {
case MMIO_WRITE:
memcpy(&val, reg, size);
- return mmio_write(size, ve->gpa, val);
+ if (!mmio_write(size, ve->gpa, val))
+ return -EIO;
+ return insn.length;
case MMIO_WRITE_IMM:
val = insn.immediate.value;
- return mmio_write(size, ve->gpa, val);
+ if (!mmio_write(size, ve->gpa, val))
+ return -EIO;
+ return insn.length;
case MMIO_READ:
case MMIO_READ_ZERO_EXTEND:
case MMIO_READ_SIGN_EXTEND:
@@ -334,15 +397,15 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
* decoded or handled properly. It was likely not using io.h
* helpers or accessed MMIO accidentally.
*/
- return false;
+ return -EINVAL;
default:
WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
- return false;
+ return -EINVAL;
}
/* Handle reads */
if (!mmio_read(size, ve->gpa, &val))
- return false;
+ return -EIO;
switch (mmio) {
case MMIO_READ:
@@ -364,13 +427,13 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
default:
/* All other cases has to be covered with the first switch() */
WARN_ON_ONCE(1);
- return false;
+ return -EINVAL;
}
if (extend_size)
memset(reg, extend_val, extend_size);
memcpy(reg, &val, size);
- return true;
+ return insn.length;
}
static bool handle_in(struct pt_regs *regs, int size, int port)
@@ -421,13 +484,14 @@ static bool handle_out(struct pt_regs *regs, int size, int port)
*
* Return True on success or False on failure.
*/
-static bool handle_io(struct pt_regs *regs, u32 exit_qual)
+static int handle_io(struct pt_regs *regs, struct ve_info *ve)
{
+ u32 exit_qual = ve->exit_qual;
int size, port;
- bool in;
+ bool in, ret;
if (VE_IS_IO_STRING(exit_qual))
- return false;
+ return -EIO;
in = VE_IS_IO_IN(exit_qual);
size = VE_GET_IO_SIZE(exit_qual);
@@ -435,9 +499,13 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual)
if (in)
- return handle_in(regs, size, port);
+ ret = handle_in(regs, size, port);
else
- return handle_out(regs, size, port);
+ ret = handle_out(regs, size, port);
+ if (!ret)
+ return -EIO;
+
+ return ve_instr_len(ve);
}
/*
@@ -447,13 +515,19 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual)
__init bool tdx_early_handle_ve(struct pt_regs *regs)
{
struct ve_info ve;
+ int insn_len;
tdx_get_ve_info(&ve);
if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
return false;
- return handle_io(regs, ve.exit_qual);
+ insn_len = handle_io(regs, &ve);
+ if (insn_len < 0)
+ return false;
+
+ regs->ip += insn_len;
+ return true;
}
void tdx_get_ve_info(struct ve_info *ve)
@@ -486,54 +560,65 @@ void tdx_get_ve_info(struct ve_info *ve)
ve->instr_info = upper_32_bits(out.r10);
}
-/* Handle the user initiated #VE */
-static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
+/*
+ * Handle the user initiated #VE.
+ *
+ * On success, returns the number of bytes RIP should be incremented (>=0)
+ * or -errno on error.
+ */
+static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
{
switch (ve->exit_reason) {
case EXIT_REASON_CPUID:
- return handle_cpuid(regs);
+ return handle_cpuid(regs, ve);
default:
pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
- return false;
+ return -EIO;
}
}
-/* Handle the kernel #VE */
-static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
+/*
+ * Handle the kernel #VE.
+ *
+ * On success, returns the number of bytes RIP should be incremented (>=0)
+ * or -errno on error.
+ */
+static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
{
switch (ve->exit_reason) {
case EXIT_REASON_HLT:
- return handle_halt();
+ return handle_halt(ve);
case EXIT_REASON_MSR_READ:
- return read_msr(regs);
+ return read_msr(regs, ve);
case EXIT_REASON_MSR_WRITE:
- return write_msr(regs);
+ return write_msr(regs, ve);
case EXIT_REASON_CPUID:
- return handle_cpuid(regs);
+ return handle_cpuid(regs, ve);
case EXIT_REASON_EPT_VIOLATION:
return handle_mmio(regs, ve);
case EXIT_REASON_IO_INSTRUCTION:
- return handle_io(regs, ve->exit_qual);
+ return handle_io(regs, ve);
default:
pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
- return false;
+ return -EIO;
}
}
bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
{
- bool ret;
+ int insn_len;
if (user_mode(regs))
- ret = virt_exception_user(regs, ve);
+ insn_len = virt_exception_user(regs, ve);
else
- ret = virt_exception_kernel(regs, ve);
+ insn_len = virt_exception_kernel(regs, ve);
+ if (insn_len < 0)
+ return false;
/* After successful #VE handling, move the IP */
- if (ret)
- regs->ip += ve->instr_len;
+ regs->ip += insn_len;
- return ret;
+ return true;
}
static bool tdx_tlb_flush_required(bool private)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 8b392b6b7b93..3de6d8b53367 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -13,6 +13,7 @@
#include <linux/io.h>
#include <asm/apic.h>
#include <asm/desc.h>
+#include <asm/sev.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
@@ -405,6 +406,11 @@ void __init hyperv_init(void)
}
if (hv_isolation_type_snp()) {
+ /* Negotiate GHCB Version. */
+ if (!hv_ghcb_negotiate_protocol())
+ hv_ghcb_terminate(SEV_TERM_SET_GEN,
+ GHCB_SEV_ES_PROT_UNSUPPORTED);
+
hv_ghcb_pg = alloc_percpu(union hv_ghcb *);
if (!hv_ghcb_pg)
goto free_vp_assist_page;
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 2b994117581e..1dbcbd9da74d 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -53,6 +53,8 @@ union hv_ghcb {
} hypercall;
} __packed __aligned(HV_HYP_PAGE_SIZE);
+static u16 hv_ghcb_version __ro_after_init;
+
u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
{
union hv_ghcb *hv_ghcb;
@@ -96,12 +98,85 @@ u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
return status;
}
+static inline u64 rd_ghcb_msr(void)
+{
+ return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
+}
+
+static inline void wr_ghcb_msr(u64 val)
+{
+ native_wrmsrl(MSR_AMD64_SEV_ES_GHCB, val);
+}
+
+static enum es_result hv_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code,
+ u64 exit_info_1, u64 exit_info_2)
+{
+ /* Fill in protocol and format specifiers */
+ ghcb->protocol_version = hv_ghcb_version;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, exit_code);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ VMGEXIT();
+
+ if (ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0))
+ return ES_VMM_ERROR;
+ else
+ return ES_OK;
+}
+
+void hv_ghcb_terminate(unsigned int set, unsigned int reason)
+{
+ u64 val = GHCB_MSR_TERM_REQ;
+
+ /* Tell the hypervisor what went wrong. */
+ val |= GHCB_SEV_TERM_REASON(set, reason);
+
+ /* Request Guest Termination from Hypvervisor */
+ wr_ghcb_msr(val);
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
+bool hv_ghcb_negotiate_protocol(void)
+{
+ u64 ghcb_gpa;
+ u64 val;
+
+ /* Save ghcb page gpa. */
+ ghcb_gpa = rd_ghcb_msr();
+
+ /* Do the GHCB protocol version negotiation */
+ wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
+ VMGEXIT();
+ val = rd_ghcb_msr();
+
+ if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
+ return false;
+
+ if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
+ GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
+ return false;
+
+ hv_ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val),
+ GHCB_PROTOCOL_MAX);
+
+ /* Write ghcb page back after negotiating protocol. */
+ wr_ghcb_msr(ghcb_gpa);
+ VMGEXIT();
+
+ return true;
+}
+
void hv_ghcb_msr_write(u64 msr, u64 value)
{
union hv_ghcb *hv_ghcb;
void **ghcb_base;
unsigned long flags;
- struct es_em_ctxt ctxt;
if (!hv_ghcb_pg)
return;
@@ -120,8 +195,7 @@ void hv_ghcb_msr_write(u64 msr, u64 value)
ghcb_set_rax(&hv_ghcb->ghcb, lower_32_bits(value));
ghcb_set_rdx(&hv_ghcb->ghcb, upper_32_bits(value));
- if (sev_es_ghcb_hv_call(&hv_ghcb->ghcb, false, &ctxt,
- SVM_EXIT_MSR, 1, 0))
+ if (hv_ghcb_hv_call(&hv_ghcb->ghcb, SVM_EXIT_MSR, 1, 0))
pr_warn("Fail to write msr via ghcb %llx.\n", msr);
local_irq_restore(flags);
@@ -133,7 +207,6 @@ void hv_ghcb_msr_read(u64 msr, u64 *value)
union hv_ghcb *hv_ghcb;
void **ghcb_base;
unsigned long flags;
- struct es_em_ctxt ctxt;
/* Check size of union hv_ghcb here. */
BUILD_BUG_ON(sizeof(union hv_ghcb) != HV_HYP_PAGE_SIZE);
@@ -152,8 +225,7 @@ void hv_ghcb_msr_read(u64 msr, u64 *value)
}
ghcb_set_rcx(&hv_ghcb->ghcb, msr);
- if (sev_es_ghcb_hv_call(&hv_ghcb->ghcb, false, &ctxt,
- SVM_EXIT_MSR, 0, 0))
+ if (hv_ghcb_hv_call(&hv_ghcb->ghcb, SVM_EXIT_MSR, 0, 0))
pr_warn("Fail to read msr via ghcb %llx.\n", msr);
else
*value = (u64)lower_32_bits(hv_ghcb->ghcb.save.rax)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 393f2bbb5e3a..03acc823838a 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -446,5 +446,6 @@
#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */
#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
+#define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h
index 5a39ed59b6db..e8f58ddd06d9 100644
--- a/arch/x86/include/asm/e820/api.h
+++ b/arch/x86/include/asm/e820/api.h
@@ -4,9 +4,6 @@
#include <asm/e820/types.h>
-struct device;
-struct resource;
-
extern struct e820_table *e820_table;
extern struct e820_table *e820_table_kexec;
extern struct e820_table *e820_table_firmware;
@@ -46,8 +43,6 @@ extern void e820__register_nosave_regions(unsigned long limit_pfn);
extern int e820__get_entry_type(u64 start, u64 end);
-extern void remove_e820_regions(struct device *dev, struct resource *avail);
-
/*
* Returns true iff the specified range [start,end) is completely contained inside
* the ISA region.
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 71943dce691e..9636742a80f2 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -323,7 +323,7 @@ static inline u32 efi64_convert_status(efi_status_t status)
#define __efi64_argmap_get_memory_space_descriptor(phys, desc) \
(__efi64_split(phys), (desc))
-#define __efi64_argmap_set_memory_space_descriptor(phys, size, flags) \
+#define __efi64_argmap_set_memory_space_attributes(phys, size, flags) \
(__efi64_split(phys), __efi64_split(size), __efi64_split(flags))
/*
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 959d66b9be94..9217bd6cf0d1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -653,6 +653,7 @@ struct kvm_vcpu_arch {
u64 ia32_misc_enable_msr;
u64 smbase;
u64 smi_count;
+ bool at_instruction_boundary;
bool tpr_access_reporting;
bool xsaves_enabled;
bool xfd_no_write_intercept;
@@ -1046,14 +1047,77 @@ struct kvm_x86_msr_filter {
};
enum kvm_apicv_inhibit {
+
+ /********************************************************************/
+ /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */
+ /********************************************************************/
+
+ /*
+ * APIC acceleration is disabled by a module parameter
+ * and/or not supported in hardware.
+ */
APICV_INHIBIT_REASON_DISABLE,
+
+ /*
+ * APIC acceleration is inhibited because AutoEOI feature is
+ * being used by a HyperV guest.
+ */
APICV_INHIBIT_REASON_HYPERV,
+
+ /*
+ * APIC acceleration is inhibited because the userspace didn't yet
+ * enable the kernel/split irqchip.
+ */
+ APICV_INHIBIT_REASON_ABSENT,
+
+ /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ
+ * (out of band, debug measure of blocking all interrupts on this vCPU)
+ * was enabled, to avoid AVIC/APICv bypassing it.
+ */
+ APICV_INHIBIT_REASON_BLOCKIRQ,
+
+ /*
+ * For simplicity, the APIC acceleration is inhibited
+ * first time either APIC ID or APIC base are changed by the guest
+ * from their reset values.
+ */
+ APICV_INHIBIT_REASON_APIC_ID_MODIFIED,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED,
+
+ /******************************************************/
+ /* INHIBITs that are relevant only to the AMD's AVIC. */
+ /******************************************************/
+
+ /*
+ * AVIC is inhibited on a vCPU because it runs a nested guest.
+ *
+ * This is needed because unlike APICv, the peers of this vCPU
+ * cannot use the doorbell mechanism to signal interrupts via AVIC when
+ * a vCPU runs nested.
+ */
APICV_INHIBIT_REASON_NESTED,
+
+ /*
+ * On SVM, the wait for the IRQ window is implemented with pending vIRQ,
+ * which cannot be injected when the AVIC is enabled, thus AVIC
+ * is inhibited while KVM waits for IRQ window.
+ */
APICV_INHIBIT_REASON_IRQWIN,
+
+ /*
+ * PIT (i8254) 're-inject' mode, relies on EOI intercept,
+ * which AVIC doesn't support for edge triggered interrupts.
+ */
APICV_INHIBIT_REASON_PIT_REINJ,
+
+ /*
+ * AVIC is inhibited because the guest has x2apic in its CPUID.
+ */
APICV_INHIBIT_REASON_X2APIC,
- APICV_INHIBIT_REASON_BLOCKIRQ,
- APICV_INHIBIT_REASON_ABSENT,
+
+ /*
+ * AVIC is disabled because SEV doesn't support it.
+ */
APICV_INHIBIT_REASON_SEV,
};
@@ -1300,6 +1364,8 @@ struct kvm_vcpu_stat {
u64 nested_run;
u64 directed_yield_attempted;
u64 directed_yield_successful;
+ u64 preemption_reported;
+ u64 preemption_other;
u64 guest_mode;
};
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index a82f603d4312..61f0c206bff0 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -179,9 +179,13 @@ int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible);
#ifdef CONFIG_AMD_MEM_ENCRYPT
void hv_ghcb_msr_write(u64 msr, u64 value);
void hv_ghcb_msr_read(u64 msr, u64 *value);
+bool hv_ghcb_negotiate_protocol(void);
+void hv_ghcb_terminate(unsigned int set, unsigned int reason);
#else
static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
+static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
+static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
#endif
extern bool hv_isolation_type_snp(void);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 403e83b4adc8..d27e0581b777 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -116,6 +116,30 @@
* Not susceptible to
* TSX Async Abort (TAA) vulnerabilities.
*/
+#define ARCH_CAP_SBDR_SSDP_NO BIT(13) /*
+ * Not susceptible to SBDR and SSDP
+ * variants of Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_FBSDP_NO BIT(14) /*
+ * Not susceptible to FBSDP variant of
+ * Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_PSDP_NO BIT(15) /*
+ * Not susceptible to PSDP variant of
+ * Processor MMIO stale data
+ * vulnerabilities.
+ */
+#define ARCH_CAP_FB_CLEAR BIT(17) /*
+ * VERW clears CPU fill buffer
+ * even on MDS_NO CPUs.
+ */
+#define ARCH_CAP_FB_CLEAR_CTRL BIT(18) /*
+ * MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]
+ * bit available to control VERW
+ * behavior.
+ */
#define MSR_IA32_FLUSH_CMD 0x0000010b
#define L1D_FLUSH BIT(0) /*
@@ -133,6 +157,7 @@
#define MSR_IA32_MCU_OPT_CTRL 0x00000123
#define RNGDS_MITG_DIS BIT(0) /* SRBDS support */
#define RTM_ALLOW BIT(1) /* TSX development mode */
+#define FB_CLEAR_DIS BIT(3) /* CPU Fill buffer clear disable */
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index acbaeaf83b61..da251a5645b0 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -269,6 +269,8 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
+DECLARE_STATIC_KEY_FALSE(mmio_stale_data_clear);
+
#include <asm/segment.h>
/**
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index f52a886d35cf..70533fdcbf02 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -69,6 +69,8 @@ void pcibios_scan_specific_bus(int busn);
/* pci-irq.c */
+struct pci_dev;
+
struct irq_info {
u8 bus, devfn; /* Bus, device and function */
struct {
@@ -246,3 +248,9 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)
# define x86_default_pci_init_irq NULL
# define x86_default_pci_fixup_irqs NULL
#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_ACPI)
+extern bool pci_use_e820;
+#else
+#define pci_use_e820 false
+#endif
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 7590ac2570b9..f8b9ee97a891 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -108,19 +108,16 @@ extern unsigned long _brk_end;
void *extend_brk(size_t size, size_t align);
/*
- * Reserve space in the brk section. The name must be unique within the file,
- * and somewhat descriptive. The size is in bytes.
+ * Reserve space in the .brk section, which is a block of memory from which the
+ * caller is allowed to allocate very early (before even memblock is available)
+ * by calling extend_brk(). All allocated memory will be eventually converted
+ * to memblock. Any leftover unallocated memory will be freed.
*
- * The allocation is done using inline asm (rather than using a section
- * attribute on a normal variable) in order to allow the use of @nobits, so
- * that it doesn't take up any space in the vmlinux file.
+ * The size is in bytes.
*/
-#define RESERVE_BRK(name, size) \
- asm(".pushsection .brk_reservation,\"aw\",@nobits\n\t" \
- ".brk." #name ":\n\t" \
- ".skip " __stringify(size) "\n\t" \
- ".size .brk." #name ", " __stringify(size) "\n\t" \
- ".popsection\n\t")
+#define RESERVE_BRK(name, size) \
+ __section(".bss..brk") __aligned(1) __used \
+ static char __brk_##name[size]
extern void probe_roms(void);
#ifdef __i386__
@@ -133,12 +130,19 @@ asmlinkage void __init x86_64_start_reservations(char *real_mode_data);
#endif /* __i386__ */
#endif /* _SETUP */
-#else
-#define RESERVE_BRK(name,sz) \
- .pushsection .brk_reservation,"aw",@nobits; \
-.brk.name: \
-1: .skip sz; \
- .size .brk.name,.-1b; \
+
+#else /* __ASSEMBLY */
+
+.macro __RESERVE_BRK name, size
+ .pushsection .bss..brk, "aw"
+SYM_DATA_START(__brk_\name)
+ .skip \size
+SYM_DATA_END(__brk_\name)
.popsection
+.endm
+
+#define RESERVE_BRK(name, size) __RESERVE_BRK name, size
+
#endif /* __ASSEMBLY__ */
+
#endif /* _ASM_X86_SETUP_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 35f222aa66bf..913e593a3b45 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -439,7 +439,7 @@ do { \
[ptr] "+m" (*_ptr), \
[old] "+a" (__old) \
: [new] ltype (__new) \
- : "memory", "cc"); \
+ : "memory"); \
if (unlikely(__err)) \
goto label; \
if (unlikely(!success)) \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 03364dc40d8d..4c8b6ae802ac 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -36,10 +36,6 @@ KCSAN_SANITIZE := n
OBJECT_FILES_NON_STANDARD_test_nx.o := y
-ifdef CONFIG_FRAME_POINTER
-OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
-endif
-
# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d879a6c93609..74c62cc47a5f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -41,8 +41,10 @@ static void __init spectre_v2_select_mitigation(void);
static void __init ssb_select_mitigation(void);
static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
-static void __init mds_print_mitigation(void);
+static void __init md_clear_update_mitigation(void);
+static void __init md_clear_select_mitigation(void);
static void __init taa_select_mitigation(void);
+static void __init mmio_select_mitigation(void);
static void __init srbds_select_mitigation(void);
static void __init l1d_flush_select_mitigation(void);
@@ -85,6 +87,10 @@ EXPORT_SYMBOL_GPL(mds_idle_clear);
*/
DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
+/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */
+DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear);
+EXPORT_SYMBOL_GPL(mmio_stale_data_clear);
+
void __init check_bugs(void)
{
identify_boot_cpu();
@@ -117,17 +123,10 @@ void __init check_bugs(void)
spectre_v2_select_mitigation();
ssb_select_mitigation();
l1tf_select_mitigation();
- mds_select_mitigation();
- taa_select_mitigation();
+ md_clear_select_mitigation();
srbds_select_mitigation();
l1d_flush_select_mitigation();
- /*
- * As MDS and TAA mitigations are inter-related, print MDS
- * mitigation until after TAA mitigation selection is done.
- */
- mds_print_mitigation();
-
arch_smt_update();
#ifdef CONFIG_X86_32
@@ -267,14 +266,6 @@ static void __init mds_select_mitigation(void)
}
}
-static void __init mds_print_mitigation(void)
-{
- if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off())
- return;
-
- pr_info("%s\n", mds_strings[mds_mitigation]);
-}
-
static int __init mds_cmdline(char *str)
{
if (!boot_cpu_has_bug(X86_BUG_MDS))
@@ -329,7 +320,7 @@ static void __init taa_select_mitigation(void)
/* TSX previously disabled by tsx=off */
if (!boot_cpu_has(X86_FEATURE_RTM)) {
taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
- goto out;
+ return;
}
if (cpu_mitigations_off()) {
@@ -343,7 +334,7 @@ static void __init taa_select_mitigation(void)
*/
if (taa_mitigation == TAA_MITIGATION_OFF &&
mds_mitigation == MDS_MITIGATION_OFF)
- goto out;
+ return;
if (boot_cpu_has(X86_FEATURE_MD_CLEAR))
taa_mitigation = TAA_MITIGATION_VERW;
@@ -375,18 +366,6 @@ static void __init taa_select_mitigation(void)
if (taa_nosmt || cpu_mitigations_auto_nosmt())
cpu_smt_disable(false);
-
- /*
- * Update MDS mitigation, if necessary, as the mds_user_clear is
- * now enabled for TAA mitigation.
- */
- if (mds_mitigation == MDS_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MDS)) {
- mds_mitigation = MDS_MITIGATION_FULL;
- mds_select_mitigation();
- }
-out:
- pr_info("%s\n", taa_strings[taa_mitigation]);
}
static int __init tsx_async_abort_parse_cmdline(char *str)
@@ -411,6 +390,151 @@ static int __init tsx_async_abort_parse_cmdline(char *str)
early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "MMIO Stale Data: " fmt
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW;
+static bool mmio_nosmt __ro_after_init = false;
+
+static const char * const mmio_strings[] = {
+ [MMIO_MITIGATION_OFF] = "Vulnerable",
+ [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+};
+
+static void __init mmio_select_mitigation(void)
+{
+ u64 ia32_cap;
+
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
+ cpu_mitigations_off()) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ return;
+ }
+
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+ ia32_cap = x86_read_arch_cap_msr();
+
+ /*
+ * Enable CPU buffer clear mitigation for host and VMM, if also affected
+ * by MDS or TAA. Otherwise, enable mitigation for VMM only.
+ */
+ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
+ boot_cpu_has(X86_FEATURE_RTM)))
+ static_branch_enable(&mds_user_clear);
+ else
+ static_branch_enable(&mmio_stale_data_clear);
+
+ /*
+ * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can
+ * be propagated to uncore buffers, clearing the Fill buffers on idle
+ * is required irrespective of SMT state.
+ */
+ if (!(ia32_cap & ARCH_CAP_FBSDP_NO))
+ static_branch_enable(&mds_idle_clear);
+
+ /*
+ * Check if the system has the right microcode.
+ *
+ * CPU Fill buffer clear mitigation is enumerated by either an explicit
+ * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
+ * affected systems.
+ */
+ if ((ia32_cap & ARCH_CAP_FB_CLEAR) ||
+ (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
+ boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+ !(ia32_cap & ARCH_CAP_MDS_NO)))
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ else
+ mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
+
+ if (mmio_nosmt || cpu_mitigations_auto_nosmt())
+ cpu_smt_disable(false);
+}
+
+static int __init mmio_stale_data_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static void __init md_clear_update_mitigation(void)
+{
+ if (cpu_mitigations_off())
+ return;
+
+ if (!static_key_enabled(&mds_user_clear))
+ goto out;
+
+ /*
+ * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data
+ * mitigation, if necessary.
+ */
+ if (mds_mitigation == MDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_select_mitigation();
+ }
+ if (taa_mitigation == TAA_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_select_mitigation();
+ }
+ if (mmio_mitigation == MMIO_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_select_mitigation();
+ }
+out:
+ if (boot_cpu_has_bug(X86_BUG_MDS))
+ pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ pr_info("TAA: %s\n", taa_strings[taa_mitigation]);
+ if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
+}
+
+static void __init md_clear_select_mitigation(void)
+{
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+
+ /*
+ * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update
+ * and print their mitigation after MDS, TAA and MMIO Stale Data
+ * mitigation selection is done.
+ */
+ md_clear_update_mitigation();
+}
+
+#undef pr_fmt
#define pr_fmt(fmt) "SRBDS: " fmt
enum srbds_mitigations {
@@ -478,11 +602,13 @@ static void __init srbds_select_mitigation(void)
return;
/*
- * Check to see if this is one of the MDS_NO systems supporting
- * TSX that are only exposed to SRBDS when TSX is enabled.
+ * Check to see if this is one of the MDS_NO systems supporting TSX that
+ * are only exposed to SRBDS when TSX is enabled or when CPU is affected
+ * by Processor MMIO Stale Data vulnerability.
*/
ia32_cap = x86_read_arch_cap_msr();
- if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM))
+ if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
@@ -1116,6 +1242,8 @@ static void update_indir_branch_cond(void)
/* Update the static key controlling the MDS CPU buffer clear in idle */
static void update_mds_branch_idle(void)
{
+ u64 ia32_cap = x86_read_arch_cap_msr();
+
/*
* Enable the idle clearing if SMT is active on CPUs which are
* affected only by MSBDS and not any other MDS variant.
@@ -1127,14 +1255,17 @@ static void update_mds_branch_idle(void)
if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
return;
- if (sched_smt_active())
+ if (sched_smt_active()) {
static_branch_enable(&mds_idle_clear);
- else
+ } else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
+ (ia32_cap & ARCH_CAP_FBSDP_NO)) {
static_branch_disable(&mds_idle_clear);
+ }
}
#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
+#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
void cpu_bugs_smt_update(void)
{
@@ -1179,6 +1310,16 @@ void cpu_bugs_smt_update(void)
break;
}
+ switch (mmio_mitigation) {
+ case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(MMIO_MSG_SMT);
+ break;
+ case MMIO_MITIGATION_OFF:
+ break;
+ }
+
mutex_unlock(&spec_ctrl_mutex);
}
@@ -1781,6 +1922,20 @@ static ssize_t tsx_async_abort_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t mmio_stale_data_show_state(char *buf)
+{
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mmio_strings[mmio_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
static char *stibp_state(void)
{
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
@@ -1881,6 +2036,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_SRBDS:
return srbds_show_state(buf);
+ case X86_BUG_MMIO_STALE_DATA:
+ return mmio_stale_data_show_state(buf);
+
default:
break;
}
@@ -1932,4 +2090,9 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *
{
return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
}
+
+ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
+}
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c296cb1c0113..4730b0a58f24 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1211,18 +1211,42 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
X86_FEATURE_ANY, issues)
#define SRBDS BIT(0)
+/* CPU is affected by X86_BUG_MMIO_STALE_DATA */
+#define MMIO BIT(1)
+/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
+#define MMIO_SBDS BIT(2)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x5), MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) |
+ BIT(7) | BIT(0xB), MMIO),
+ VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0x8), SRBDS),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO),
+ VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0x8), SRBDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x1, 0x1), MMIO),
+ VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO),
+ VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
+ VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS),
{}
};
@@ -1243,6 +1267,13 @@ u64 x86_read_arch_cap_msr(void)
return ia32_cap;
}
+static bool arch_cap_mmio_immune(u64 ia32_cap)
+{
+ return (ia32_cap & ARCH_CAP_FBSDP_NO &&
+ ia32_cap & ARCH_CAP_PSDP_NO &&
+ ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
+}
+
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
u64 ia32_cap = x86_read_arch_cap_msr();
@@ -1296,12 +1327,27 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
/*
* SRBDS affects CPUs which support RDRAND or RDSEED and are listed
* in the vulnerability blacklist.
+ *
+ * Some of the implications and mitigation of Shared Buffers Data
+ * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as
+ * SRBDS.
*/
if ((cpu_has(c, X86_FEATURE_RDRAND) ||
cpu_has(c, X86_FEATURE_RDSEED)) &&
- cpu_matches(cpu_vuln_blacklist, SRBDS))
+ cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS))
setup_force_cpu_bug(X86_BUG_SRBDS);
+ /*
+ * Processor MMIO Stale Data bug enumeration
+ *
+ * Affected CPU list is generally enough to enumerate the vulnerability,
+ * but for virtualization case check for ARCH_CAP MSR bits also, VMM may
+ * not want the guest to enumerate the bug.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, MMIO) &&
+ !arch_cap_mmio_immune(ia32_cap))
+ setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 4ec13608d3c6..dfeb227de561 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -175,6 +175,7 @@ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL)
jmp ftrace_epilogue
SYM_FUNC_END(ftrace_caller);
+STACK_FRAME_NON_STANDARD_FP(ftrace_caller)
SYM_FUNC_START(ftrace_epilogue)
/*
@@ -282,6 +283,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
jmp ftrace_epilogue
SYM_FUNC_END(ftrace_regs_caller)
+STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
@@ -311,10 +313,14 @@ trace:
jmp ftrace_stub
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
+STACK_FRAME_NON_STANDARD_FP(__fentry__)
+
#endif /* CONFIG_DYNAMIC_FTRACE */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-SYM_FUNC_START(return_to_handler)
+SYM_CODE_START(return_to_handler)
+ UNWIND_HINT_EMPTY
+ ANNOTATE_NOENDBR
subq $16, %rsp
/* Save the return values */
@@ -339,7 +345,6 @@ SYM_FUNC_START(return_to_handler)
int3
.Ldo_rop:
mov %rdi, (%rsp)
- UNWIND_HINT_FUNC
RET
-SYM_FUNC_END(return_to_handler)
+SYM_CODE_END(return_to_handler)
#endif
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index db2b350a37b7..bba1abd05bfe 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -1,7 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/dev_printk.h>
#include <linux/ioport.h>
+#include <linux/printk.h>
#include <asm/e820/api.h>
+#include <asm/pci_x86.h>
static void resource_clip(struct resource *res, resource_size_t start,
resource_size_t end)
@@ -24,14 +25,14 @@ static void resource_clip(struct resource *res, resource_size_t start,
res->start = end + 1;
}
-void remove_e820_regions(struct device *dev, struct resource *avail)
+static void remove_e820_regions(struct resource *avail)
{
int i;
struct e820_entry *entry;
u64 e820_start, e820_end;
struct resource orig = *avail;
- if (!(avail->flags & IORESOURCE_MEM))
+ if (!pci_use_e820)
return;
for (i = 0; i < e820_table->nr_entries; i++) {
@@ -41,7 +42,7 @@ void remove_e820_regions(struct device *dev, struct resource *avail)
resource_clip(avail, e820_start, e820_end);
if (orig.start != avail->start || orig.end != avail->end) {
- dev_info(dev, "clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n",
+ pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n",
&orig, avail, e820_start, e820_end);
orig = *avail;
}
@@ -55,6 +56,9 @@ void arch_remove_reservations(struct resource *avail)
* the low 1MB unconditionally, as this area is needed for some ISA
* cards requiring a memory range, e.g. the i82365 PCMCIA controller.
*/
- if (avail->flags & IORESOURCE_MEM)
+ if (avail->flags & IORESOURCE_MEM) {
resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
+
+ remove_e820_regions(avail);
+ }
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3ebb85327edb..bd6c6fd373ae 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -67,11 +67,6 @@ RESERVE_BRK(dmi_alloc, 65536);
#endif
-/*
- * Range of the BSS area. The size of the BSS area is determined
- * at link time, with RESERVE_BRK() facility reserving additional
- * chunks.
- */
unsigned long _brk_start = (unsigned long)__brk_base;
unsigned long _brk_end = (unsigned long)__brk_base;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f5f6dc2e8007..81aba718ecd5 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -385,10 +385,10 @@ SECTIONS
__end_of_kernel_reserve = .;
. = ALIGN(PAGE_SIZE);
- .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+ .brk (NOLOAD) : AT(ADDR(.brk) - LOAD_OFFSET) {
__brk_base = .;
. += 64 * 1024; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
+ *(.bss..brk) /* areas brk users have reserved */
__brk_limit = .;
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f1bdac3f5aa8..0e68b4c937fc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2039,6 +2039,19 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
}
}
+static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
+{
+ struct kvm *kvm = apic->vcpu->kvm;
+
+ if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
+ return;
+
+ if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
+ return;
+
+ kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+}
+
static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
int ret = 0;
@@ -2047,10 +2060,12 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
switch (reg) {
case APIC_ID: /* Local APIC ID */
- if (!apic_x2apic_mode(apic))
+ if (!apic_x2apic_mode(apic)) {
kvm_apic_set_xapic_id(apic, val >> 24);
- else
+ kvm_lapic_xapic_id_updated(apic);
+ } else {
ret = 1;
+ }
break;
case APIC_TASKPRI:
@@ -2336,8 +2351,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
MSR_IA32_APICBASE_BASE;
if ((value & MSR_IA32_APICBASE_ENABLE) &&
- apic->base_address != APIC_DEFAULT_PHYS_BASE)
- pr_warn_once("APIC base relocation is unsupported by KVM");
+ apic->base_address != APIC_DEFAULT_PHYS_BASE) {
+ kvm_set_apicv_inhibit(apic->vcpu->kvm,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
+ }
}
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
@@ -2648,6 +2665,8 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
}
+ } else {
+ kvm_lapic_xapic_id_updated(vcpu->arch.apic);
}
return 0;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f4653688fa6d..17252f39bd7c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3411,7 +3411,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
i << 30, PT32_ROOT_LEVEL, true);
mmu->pae_root[i] = root | PT_PRESENT_MASK |
- shadow_me_mask;
+ shadow_me_value;
}
mmu->root.hpa = __pa(mmu->pae_root);
} else {
@@ -5179,7 +5179,7 @@ static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
roots_to_free |= KVM_MMU_ROOT_CURRENT;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
- if (is_obsolete_root(kvm, mmu->root.hpa))
+ if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
}
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 6d3b3e5a5533..ee4802d7b36c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -146,6 +146,15 @@ static bool try_step_up(struct tdp_iter *iter)
}
/*
+ * Step the iterator back up a level in the paging structure. Should only be
+ * used when the iterator is below the root level.
+ */
+void tdp_iter_step_up(struct tdp_iter *iter)
+{
+ WARN_ON(!try_step_up(iter));
+}
+
+/*
* Step to the next SPTE in a pre-order traversal of the paging structure.
* To get to the next SPTE, the iterator either steps down towards the goal
* GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index f0af385c56e0..adfca0cf94d3 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
+void tdp_iter_step_up(struct tdp_iter *iter);
#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 841feaa48be5..7b9265d67131 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1742,12 +1742,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
+ int max_mapping_level;
kvm_pfn_t pfn;
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
-retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
@@ -1755,15 +1755,41 @@ retry:
!is_last_spte(iter.old_spte, iter.level))
continue;
+ /*
+ * This is a leaf SPTE. Check if the PFN it maps can
+ * be mapped at a higher level.
+ */
pfn = spte_to_pfn(iter.old_spte);
- if (kvm_is_reserved_pfn(pfn) ||
- iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
- pfn, PG_LEVEL_NUM))
+
+ if (kvm_is_reserved_pfn(pfn))
continue;
+ max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
+ iter.gfn, pfn, PG_LEVEL_NUM);
+
+ WARN_ON(max_mapping_level < iter.level);
+
+ /*
+ * If this page is already mapped at the highest
+ * viable level, there's nothing more to do.
+ */
+ if (max_mapping_level == iter.level)
+ continue;
+
+ /*
+ * The page can be remapped at a higher level, so step
+ * up to zap the parent SPTE.
+ */
+ while (max_mapping_level > iter.level)
+ tdp_iter_step_up(&iter);
+
/* Note, a successful atomic zap also does a remote TLB flush. */
- if (tdp_mmu_zap_spte_atomic(kvm, &iter))
- goto retry;
+ tdp_mmu_zap_spte_atomic(kvm, &iter);
+
+ /*
+ * If the atomic zap fails, the iter will recurse back into
+ * the same subtree to retry.
+ */
}
rcu_read_unlock();
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 54fe03714f8a..d1bc5820ea46 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -291,58 +291,91 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
- u32 dest, apic_id;
- struct kvm_vcpu *vcpu;
+ u32 l1_physical_id, dest;
+ struct kvm_vcpu *target_vcpu;
int dest_mode = icrl & APIC_DEST_MASK;
int shorthand = icrl & APIC_SHORT_MASK;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
- u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
if (shorthand != APIC_DEST_NOSHORT)
return -EINVAL;
- /*
- * The AVIC incomplete IPI #vmexit info provides index into
- * the physical APIC ID table, which can be used to derive
- * guest physical APIC ID.
- */
+ if (apic_x2apic_mode(source))
+ dest = icrh;
+ else
+ dest = GET_APIC_DEST_FIELD(icrh);
+
if (dest_mode == APIC_DEST_PHYSICAL) {
- apic_id = index;
+ /* broadcast destination, use slow path */
+ if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
+ return -EINVAL;
+ if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
+ return -EINVAL;
+
+ l1_physical_id = dest;
+
+ if (WARN_ON_ONCE(l1_physical_id != index))
+ return -EINVAL;
+
} else {
- if (!apic_x2apic_mode(source)) {
- /* For xAPIC logical mode, the index is for logical APIC table. */
- apic_id = avic_logical_id_table[index] & 0x1ff;
+ u32 bitmap, cluster;
+ int logid_index;
+
+ if (apic_x2apic_mode(source)) {
+ /* 16 bit dest mask, 16 bit cluster id */
+ bitmap = dest & 0xFFFF0000;
+ cluster = (dest >> 16) << 4;
+ } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
+ /* 8 bit dest mask*/
+ bitmap = dest;
+ cluster = 0;
} else {
- return -EINVAL;
+ /* 4 bit desk mask, 4 bit cluster id */
+ bitmap = dest & 0xF;
+ cluster = (dest >> 4) << 2;
}
- }
- /*
- * Assuming vcpu ID is the same as physical apic ID,
- * and use it to retrieve the target vCPU.
- */
- vcpu = kvm_get_vcpu_by_id(kvm, apic_id);
- if (!vcpu)
- return -EINVAL;
+ if (unlikely(!bitmap))
+ /* guest bug: nobody to send the logical interrupt to */
+ return 0;
- if (apic_x2apic_mode(vcpu->arch.apic))
- dest = icrh;
- else
- dest = GET_APIC_DEST_FIELD(icrh);
+ if (!is_power_of_2(bitmap))
+ /* multiple logical destinations, use slow path */
+ return -EINVAL;
- /*
- * Try matching the destination APIC ID with the vCPU.
- */
- if (kvm_apic_match_dest(vcpu, source, shorthand, dest, dest_mode)) {
- vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
- return 0;
+ logid_index = cluster + __ffs(bitmap);
+
+ if (apic_x2apic_mode(source)) {
+ l1_physical_id = logid_index;
+ } else {
+ u32 *avic_logical_id_table =
+ page_address(kvm_svm->avic_logical_id_table_page);
+
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ if (WARN_ON_ONCE(index != logid_index))
+ return -EINVAL;
+
+ /* guest bug: non existing/reserved logical destination */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return 0;
+
+ l1_physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ }
}
- return -EINVAL;
+ target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
+ if (unlikely(!target_vcpu))
+ /* guest bug: non existing vCPU is a target of this IPI*/
+ return 0;
+
+ target_vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(target_vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ return 0;
}
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
@@ -508,35 +541,6 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
return ret;
}
-static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
-{
- u64 *old, *new;
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 id = kvm_xapic_id(vcpu->arch.apic);
-
- if (vcpu->vcpu_id == id)
- return 0;
-
- old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
- new = avic_get_physical_id_entry(vcpu, id);
- if (!new || !old)
- return 1;
-
- /* We need to move physical_id_entry to new offset */
- *new = *old;
- *old = 0ULL;
- to_svm(vcpu)->avic_physical_id_cache = new;
-
- /*
- * Also update the guest physical APIC ID in the logical
- * APIC ID table entry if already setup the LDR.
- */
- if (svm->ldr_reg)
- avic_handle_ldr_update(vcpu);
-
- return 0;
-}
-
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -555,10 +559,6 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
switch (offset) {
- case APIC_ID:
- if (avic_handle_apic_id_update(vcpu))
- return 0;
- break;
case APIC_LDR:
if (avic_handle_ldr_update(vcpu))
return 0;
@@ -650,8 +650,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
- if (avic_handle_apic_id_update(vcpu) != 0)
- return;
avic_handle_dfr_update(vcpu);
avic_handle_ldr_update(vcpu);
}
@@ -910,7 +908,9 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
BIT(APICV_INHIBIT_REASON_X2APIC) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_SEV);
+ BIT(APICV_INHIBIT_REASON_SEV) |
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
return supported & BIT(reason);
}
@@ -946,7 +946,7 @@ out:
return ret;
}
-void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
int h_physical_id = kvm_cpu_get_apicid(cpu);
@@ -978,7 +978,7 @@ void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}
-void __avic_vcpu_put(struct kvm_vcpu *vcpu)
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
@@ -997,25 +997,6 @@ void __avic_vcpu_put(struct kvm_vcpu *vcpu)
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
-static void avic_vcpu_load(struct kvm_vcpu *vcpu)
-{
- int cpu = get_cpu();
-
- WARN_ON(cpu != vcpu->cpu);
-
- __avic_vcpu_load(vcpu, cpu);
-
- put_cpu();
-}
-
-static void avic_vcpu_put(struct kvm_vcpu *vcpu)
-{
- preempt_disable();
-
- __avic_vcpu_put(vcpu);
-
- preempt_enable();
-}
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
@@ -1042,7 +1023,7 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
vmcb_mark_dirty(vmcb, VMCB_AVIC);
if (activated)
- avic_vcpu_load(vcpu);
+ avic_vcpu_load(vcpu, vcpu->cpu);
else
avic_vcpu_put(vcpu);
@@ -1075,5 +1056,5 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
- avic_vcpu_load(vcpu);
+ avic_vcpu_load(vcpu, vcpu->cpu);
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index bed5e1692cef..ba7cd26f438f 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -616,6 +616,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ u32 pause_count12;
+ u32 pause_thresh12;
/*
* Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
@@ -671,27 +673,25 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
if (!nested_vmcb_needs_vls_intercept(svm))
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
+ pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
if (kvm_pause_in_guest(svm->vcpu.kvm)) {
- /* use guest values since host doesn't use them */
- vmcb02->control.pause_filter_count =
- svm->pause_filter_enabled ?
- svm->nested.ctl.pause_filter_count : 0;
+ /* use guest values since host doesn't intercept PAUSE */
+ vmcb02->control.pause_filter_count = pause_count12;
+ vmcb02->control.pause_filter_thresh = pause_thresh12;
- vmcb02->control.pause_filter_thresh =
- svm->pause_threshold_enabled ?
- svm->nested.ctl.pause_filter_thresh : 0;
-
- } else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
- /* use host values when guest doesn't use them */
+ } else {
+ /* start from host values otherwise */
vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
- } else {
- /*
- * Intercept every PAUSE otherwise and
- * ignore both host and guest values
- */
- vmcb02->control.pause_filter_count = 0;
- vmcb02->control.pause_filter_thresh = 0;
+
+ /* ... but ensure filtering is disabled if so requested. */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ if (!pause_count12)
+ vmcb02->control.pause_filter_count = 0;
+ if (!pause_thresh12)
+ vmcb02->control.pause_filter_thresh = 0;
+ }
}
nested_svm_transition_tlb_flush(vcpu);
@@ -951,8 +951,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
- if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count)
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+
+ }
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
@@ -982,7 +985,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled);
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
- svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
}
svm->nested.ctl.nested_cr3 = 0;
@@ -1387,7 +1390,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
vcpu->arch.tsc_scaling_ratio =
kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
svm->tsc_ratio_msr);
- svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
}
/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 200045f71df0..87da90360bc7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -465,11 +465,24 @@ static int has_svm(void)
return 1;
}
+void __svm_write_tsc_multiplier(u64 multiplier)
+{
+ preempt_disable();
+
+ if (multiplier == __this_cpu_read(current_tsc_ratio))
+ goto out;
+
+ wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
+ __this_cpu_write(current_tsc_ratio, multiplier);
+out:
+ preempt_enable();
+}
+
static void svm_hardware_disable(void)
{
/* Make sure we clean up behind us */
if (tsc_scaling)
- wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
cpu_svm_disable();
@@ -515,8 +528,7 @@ static int svm_hardware_enable(void)
* Set the default value, even if we don't use TSC scaling
* to avoid having stale value in the msr
*/
- wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
- __this_cpu_write(current_tsc_ratio, SVM_TSC_RATIO_DEFAULT);
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
}
@@ -909,7 +921,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
- if (kvm_pause_in_guest(vcpu->kvm) || !old)
+ if (kvm_pause_in_guest(vcpu->kvm))
return;
control->pause_filter_count = __grow_ple_window(old,
@@ -930,7 +942,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
- if (kvm_pause_in_guest(vcpu->kvm) || !old)
+ if (kvm_pause_in_guest(vcpu->kvm))
return;
control->pause_filter_count =
@@ -999,11 +1011,12 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
{
- wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
+ __svm_write_tsc_multiplier(multiplier);
}
+
/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm)
@@ -1363,13 +1376,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
sev_es_prepare_switch_to_guest(hostsa);
}
- if (tsc_scaling) {
- u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
- if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
- __this_cpu_write(current_tsc_ratio, tsc_ratio);
- wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
- }
- }
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
if (likely(tsc_aux_uret_slot >= 0))
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
@@ -1392,13 +1400,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
indirect_branch_prediction_barrier();
}
if (kvm_vcpu_apicv_active(vcpu))
- __avic_vcpu_load(vcpu, cpu);
+ avic_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
if (kvm_vcpu_apicv_active(vcpu))
- __avic_vcpu_put(vcpu);
+ avic_vcpu_put(vcpu);
svm_prepare_host_switch(vcpu);
@@ -4255,6 +4263,8 @@ out:
static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
+ vcpu->arch.at_instruction_boundary = true;
}
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 21c5460e947a..1bddd336a27e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -590,7 +590,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
int nested_svm_exit_special(struct vcpu_svm *svm);
void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
-void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier);
+void __svm_write_tsc_multiplier(u64 multiplier);
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
struct vmcb_control_area *control);
void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
@@ -610,8 +610,8 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
-void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void __avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a07e8cd753ec..3a919e49129b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -229,6 +229,9 @@ static const struct {
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
+/* Control for disabling CPU Fill buffer clear */
+static bool __read_mostly vmx_fb_clear_ctrl_available;
+
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
@@ -360,6 +363,60 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
+static void vmx_setup_fb_clear_ctrl(void)
+{
+ u64 msr;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA)) {
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_FB_CLEAR_CTRL)
+ vmx_fb_clear_ctrl_available = true;
+ }
+}
+
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ msr |= FB_CLEAR_DIS;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+
+ /*
+ * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
+ * at VMEntry. Skip the MSR read/write when a guest has no use case to
+ * execute VERW.
+ */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
.set = vmentry_l1d_flush_set,
.get = vmentry_l1d_flush_get,
@@ -2252,6 +2309,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
ret = kvm_set_msr_common(vcpu, msr_info);
}
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
return ret;
}
@@ -4553,6 +4614,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
vpid_sync_context(vmx->vpid);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
}
static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
@@ -6547,6 +6610,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
return;
handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+ vcpu->arch.at_instruction_boundary = true;
}
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@ -6771,6 +6835,11 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&mds_user_clear))
mds_clear_cpu_buffers();
+ else if (static_branch_unlikely(&mmio_stale_data_clear) &&
+ kvm_arch_has_assigned_device(vcpu->kvm))
+ mds_clear_cpu_buffers();
+
+ vmx_disable_fb_clear(vmx);
if (vcpu->arch.cr2 != native_read_cr2())
native_write_cr2(vcpu->arch.cr2);
@@ -6780,6 +6849,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = native_read_cr2();
+ vmx_enable_fb_clear(vmx);
+
guest_state_exit_irqoff();
}
@@ -7708,7 +7779,9 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
return supported & BIT(reason);
}
@@ -8211,6 +8284,8 @@ static int __init vmx_init(void)
return r;
}
+ vmx_setup_fb_clear_ctrl();
+
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index b98c7e96697a..8d2342ede0c5 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -348,6 +348,8 @@ struct vcpu_vmx {
u64 msr_ia32_feature_control_valid_bits;
/* SGX Launch Control public key hash */
u64 msr_ia32_sgxlepubkeyhash[4];
+ u64 msr_ia32_mcu_opt_ctrl;
+ bool disable_fb_clear;
struct pt_desc pt_desc;
struct lbr_desc lbr_desc;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e9473c7c7390..1910e1e78b15 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -296,6 +296,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, nested_run),
STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
STATS_DESC_COUNTER(VCPU, directed_yield_successful),
+ STATS_DESC_COUNTER(VCPU, preemption_reported),
+ STATS_DESC_COUNTER(VCPU, preemption_other),
STATS_DESC_ICOUNTER(VCPU, guest_mode)
};
@@ -1615,6 +1617,9 @@ static u64 kvm_get_arch_capabilities(void)
*/
}
+ /* Guests don't need to know "Fill buffer clear control" exists */
+ data &= ~ARCH_CAP_FB_CLEAR_CTRL;
+
return data;
}
@@ -4625,6 +4630,19 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
struct kvm_memslots *slots;
static const u8 preempted = KVM_VCPU_PREEMPTED;
+ /*
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
+ * an instruction boundary and will not trigger guest emulation of any
+ * kind (see vcpu_run). Vendor specific code controls (conservatively)
+ * when this is true, for example allowing the vCPU to be marked
+ * preempted if and only if the VM-Exit was due to a host interrupt.
+ */
+ if (!vcpu->arch.at_instruction_boundary) {
+ vcpu->stat.preemption_other++;
+ return;
+ }
+
+ vcpu->stat.preemption_reported++;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4654,19 +4672,21 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
int idx;
- if (vcpu->preempted && !vcpu->arch.guest_state_protected)
- vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ if (vcpu->preempted) {
+ if (!vcpu->arch.guest_state_protected)
+ vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
- /*
- * Take the srcu lock as memslots will be accessed to check the gfn
- * cache generation against the memslots generation.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (kvm_xen_msr_enabled(vcpu->kvm))
- kvm_xen_runstate_set_preempted(vcpu);
- else
- kvm_steal_time_set_preempted(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (kvm_xen_msr_enabled(vcpu->kvm))
+ kvm_xen_runstate_set_preempted(vcpu);
+ else
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
@@ -9833,6 +9853,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
return;
down_read(&vcpu->kvm->arch.apicv_update_lock);
+ preempt_disable();
activate = kvm_vcpu_apicv_activated(vcpu);
@@ -9853,6 +9874,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
out:
+ preempt_enable();
up_read(&vcpu->kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
@@ -10422,6 +10444,13 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
+ /*
+ * If another guest vCPU requests a PV TLB flush in the middle
+ * of instruction emulation, the rest of the emulation could
+ * use a stale page translation. Assume that any code after
+ * this point can start executing an instruction.
+ */
+ vcpu->arch.at_instruction_boundary = false;
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index ee5c4ae0755c..532a535a9e99 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -159,8 +159,10 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
* behalf of the vCPU. Only if the VMM does actually block
* does it need to enter RUNSTATE_blocked.
*/
- if (vcpu->preempted)
- kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+ if (WARN_ON_ONCE(!vcpu->preempted))
+ return;
+
+ kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
}
/* 32-bit compatibility definitions, also used natively in 32-bit build */
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 11350e2fd736..9f27e14e185f 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -12,7 +12,6 @@
#include <linux/swiotlb.h>
#include <linux/cc_platform.h>
#include <linux/mem_encrypt.h>
-#include <linux/virtio_config.h>
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev)
@@ -87,9 +86,3 @@ void __init mem_encrypt_init(void)
print_mem_encrypt_feature_info();
}
-
-int arch_has_restricted_virtio_memory_access(void)
-{
- return cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT);
-}
-EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access);
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index e8f7953fda83..f6d038e2cd8e 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -21,6 +21,7 @@
#include <linux/dma-mapping.h>
#include <linux/virtio_config.h>
#include <linux/cc_platform.h>
+#include <linux/platform-feature.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
@@ -242,6 +243,9 @@ void __init sev_setup_arch(void)
size = total_mem * 6 / 100;
size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
swiotlb_adjust_size(size);
+
+ /* Set restricted memory access for virtio. */
+ platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS);
}
static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index a4f43054bc79..2f82480fd430 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -8,7 +8,6 @@
#include <linux/pci-acpi.h>
#include <asm/numa.h>
#include <asm/pci_x86.h>
-#include <asm/e820/api.h>
struct pci_root_info {
struct acpi_pci_root_info common;
@@ -20,7 +19,7 @@ struct pci_root_info {
#endif
};
-static bool pci_use_e820 = true;
+bool pci_use_e820 = true;
static bool pci_use_crs = true;
static bool pci_ignore_seg;
@@ -387,11 +386,6 @@ static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
status = acpi_pci_probe_root_resources(ci);
- if (pci_use_e820) {
- resource_list_for_each_entry(entry, &ci->resources)
- remove_e820_regions(&device->dev, entry->res);
- }
-
if (pci_use_crs) {
resource_list_for_each_entry_safe(entry, tmp, &ci->resources)
if (resource_is_pcicfg_ioport(entry->res))
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 517a9d8d8f94..8b71b1dd7639 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -195,6 +195,8 @@ static void __init xen_hvm_guest_init(void)
if (xen_pv_domain())
return;
+ xen_set_restricted_virtio_memory_access();
+
init_hvm_pv_info();
reserve_shared_info();
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index f33a4421e7cd..e3297b15701c 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -109,6 +109,8 @@ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
static void __init xen_pv_init_platform(void)
{
+ xen_set_restricted_virtio_memory_access();
+
populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);