aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig49
-rw-r--r--arch/x86/Kconfig.cpu4
-rw-r--r--arch/x86/Makefile16
-rw-r--r--arch/x86/boot/compressed/Makefile19
-rw-r--r--arch/x86/boot/compressed/efi_thunk_64.S16
-rw-r--r--arch/x86/boot/compressed/head_64.S8
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S6
-rw-r--r--arch/x86/boot/compressed/sev.c6
-rw-r--r--arch/x86/boot/string.h3
-rw-r--r--arch/x86/configs/i386_defconfig2
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/aegis128-aesni-asm.S48
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S2
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S56
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S40
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c4
-rw-r--r--arch/x86/crypto/blake2s-core.S4
-rw-r--r--arch/x86/crypto/blake2s-glue.c68
-rw-r--r--arch/x86/crypto/blake2s-shash.c77
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S14
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S14
-rw-r--r--arch/x86/crypto/camellia-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/cast5-avx-x86_64-asm_64.S12
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S10
-rw-r--r--arch/x86/crypto/chacha-avx2-x86_64.S6
-rw-r--r--arch/x86/crypto/chacha-avx512vl-x86_64.S6
-rw-r--r--arch/x86/crypto/chacha-ssse3-x86_64.S8
-rw-r--r--arch/x86/crypto/crc32-pclmul_asm.S2
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S2
-rw-r--r--arch/x86/crypto/crct10dif-pcl-asm_64.S2
-rw-r--r--arch/x86/crypto/curve25519-x86_64.c767
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S4
-rw-r--r--arch/x86/crypto/des3_ede_glue.c4
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S6
-rw-r--r--arch/x86/crypto/nh-avx2-x86_64.S2
-rw-r--r--arch/x86/crypto/nh-sse2-x86_64.S2
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S10
-rw-r--r--arch/x86/crypto/serpent-avx2-asm_64.S10
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S6
-rw-r--r--arch/x86/crypto/serpent-sse2-x86_64-asm_64.S6
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S2
-rw-r--r--arch/x86/crypto/sha1_ni_asm.S2
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S2
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S2
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S2
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S2
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S2
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S2
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S2
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S2
-rw-r--r--arch/x86/crypto/sm4-aesni-avx-asm_64.S12
-rw-r--r--arch/x86/crypto/sm4-aesni-avx2-asm_64.S8
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S10
-rw-r--r--arch/x86/crypto/twofish-i586-asm_32.S4
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64-3way.S6
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64.S4
-rw-r--r--arch/x86/entry/entry_32.S43
-rw-r--r--arch/x86/entry/entry_64.S62
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/entry/thunk_32.S2
-rw-r--r--arch/x86/entry/thunk_64.S2
-rw-r--r--arch/x86/entry/vdso/Makefile2
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S1
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S2
-rw-r--r--arch/x86/entry/vdso/vsgx.S2
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_emu_64.S6
-rw-r--r--arch/x86/events/amd/iommu.c2
-rw-r--r--arch/x86/events/core.c23
-rw-r--r--arch/x86/events/intel/core.c20
-rw-r--r--arch/x86/events/intel/lbr.c168
-rw-r--r--arch/x86/events/intel/uncore.c2
-rw-r--r--arch/x86/events/intel/uncore.h3
-rw-r--r--arch/x86/events/intel/uncore_discovery.c4
-rw-r--r--arch/x86/events/intel/uncore_discovery.h2
-rw-r--r--arch/x86/events/intel/uncore_snb.c214
-rw-r--r--arch/x86/events/intel/uncore_snbep.c2
-rw-r--r--arch/x86/events/perf_event.h12
-rw-r--r--arch/x86/events/rapl.c9
-rw-r--r--arch/x86/hyperv/hv_init.c14
-rw-r--r--arch/x86/hyperv/irqdomain.c55
-rw-r--r--arch/x86/hyperv/ivm.c28
-rw-r--r--arch/x86/hyperv/mmu.c19
-rw-r--r--arch/x86/include/asm/amd_nb.h1
-rw-r--r--arch/x86/include/asm/asm.h37
-rw-r--r--arch/x86/include/asm/barrier.h10
-rw-r--r--arch/x86/include/asm/bitops.h2
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/efi.h16
-rw-r--r--arch/x86/include/asm/extable.h6
-rw-r--r--arch/x86/include/asm/extable_fixup_types.h59
-rw-r--r--arch/x86/include/asm/fpu/api.h17
-rw-r--r--arch/x86/include/asm/fpu/signal.h3
-rw-r--r--arch/x86/include/asm/fpu/types.h32
-rw-r--r--arch/x86/include/asm/futex.h28
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h33
-rw-r--r--arch/x86/include/asm/insn-eval.h15
-rw-r--r--arch/x86/include/asm/intel-family.h2
-rw-r--r--arch/x86/include/asm/io.h20
-rw-r--r--arch/x86/include/asm/irqflags.h7
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h94
-rw-r--r--arch/x86/include/asm/kvm_page_track.h6
-rw-r--r--arch/x86/include/asm/linkage.h14
-rw-r--r--arch/x86/include/asm/mce.h28
-rw-r--r--arch/x86/include/asm/mmx.h15
-rw-r--r--arch/x86/include/asm/mshyperv.h9
-rw-r--r--arch/x86/include/asm/msr-index.h17
-rw-r--r--arch/x86/include/asm/msr.h26
-rw-r--r--arch/x86/include/asm/mtrr.h8
-rw-r--r--arch/x86/include/asm/page_32.h14
-rw-r--r--arch/x86/include/asm/page_64.h1
-rw-r--r--arch/x86/include/asm/paravirt.h7
-rw-r--r--arch/x86/include/asm/pgtable.h32
-rw-r--r--arch/x86/include/asm/pkru.h4
-rw-r--r--arch/x86/include/asm/processor.h8
-rw-r--r--arch/x86/include/asm/qspinlock.h1
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h4
-rw-r--r--arch/x86/include/asm/realmode.h1
-rw-r--r--arch/x86/include/asm/required-features.h4
-rw-r--r--arch/x86/include/asm/segment.h9
-rw-r--r--arch/x86/include/asm/set_memory.h4
-rw-r--r--arch/x86/include/asm/sev-common.h66
-rw-r--r--arch/x86/include/asm/sgx.h18
-rw-r--r--arch/x86/include/asm/static_call.h2
-rw-r--r--arch/x86/include/asm/string_32.h33
-rw-r--r--arch/x86/include/asm/tlbflush.h5
-rw-r--r--arch/x86/include/asm/topology.h2
-rw-r--r--arch/x86/include/asm/uaccess.h44
-rw-r--r--arch/x86/include/asm/word-at-a-time.h66
-rw-r--r--arch/x86/include/asm/x86_init.h6
-rw-r--r--arch/x86/include/asm/xen/hypercall.h4
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h9
-rw-r--r--arch/x86/include/asm/xen/page.h14
-rw-r--r--arch/x86/include/uapi/asm/kvm.h19
-rw-r--r--arch/x86/include/uapi/asm/prctl.h26
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/sleep.c4
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S6
-rw-r--r--arch/x86/kernel/alternative.c51
-rw-r--r--arch/x86/kernel/amd_nb.c54
-rw-r--r--arch/x86/kernel/apic/msi.c11
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/asm-offsets.c3
-rw-r--r--arch/x86/kernel/cc_platform.c16
-rw-r--r--arch/x86/kernel/cpu/common.c17
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c45
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c282
-rw-r--r--arch/x86/kernel/cpu/mce/core.c149
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c46
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c1
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h2
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c41
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c17
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h36
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c162
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h8
-rw-r--r--arch/x86/kernel/dumpstack.c4
-rw-r--r--arch/x86/kernel/early-quirks.c10
-rw-r--r--arch/x86/kernel/fpu/core.c120
-rw-r--r--arch/x86/kernel/fpu/legacy.h6
-rw-r--r--arch/x86/kernel/fpu/signal.c2
-rw-r--r--arch/x86/kernel/fpu/xstate.c147
-rw-r--r--arch/x86/kernel/fpu/xstate.h25
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/ftrace_32.S6
-rw-r--r--arch/x86/kernel/ftrace_64.S8
-rw-r--r--arch/x86/kernel/head64.c74
-rw-r--r--arch/x86/kernel/head_32.S2
-rw-r--r--arch/x86/kernel/head_64.S19
-rw-r--r--arch/x86/kernel/hpet.c8
-rw-r--r--arch/x86/kernel/irqflags.S2
-rw-r--r--arch/x86/kernel/kprobes/core.c2
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/module.c7
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/process.c10
-rw-r--r--arch/x86/kernel/process.h4
-rw-r--r--arch/x86/kernel/reboot.c12
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S10
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S10
-rw-r--r--arch/x86/kernel/resource.c23
-rw-r--r--arch/x86/kernel/setup.c79
-rw-r--r--arch/x86/kernel/setup_percpu.c66
-rw-r--r--arch/x86/kernel/sev-shared.c2
-rw-r--r--arch/x86/kernel/sev.c240
-rw-r--r--arch/x86/kernel/sev_verify_cbit.S2
-rw-r--r--arch/x86/kernel/smpboot.c14
-rw-r--r--arch/x86/kernel/static_call.c5
-rw-r--r--arch/x86/kernel/tsc.c28
-rw-r--r--arch/x86/kernel/tsc_sync.c41
-rw-r--r--arch/x86/kernel/verify_cpu.S4
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
-rw-r--r--arch/x86/kernel/x86_init.c12
-rw-r--r--arch/x86/kvm/Kconfig4
-rw-r--r--arch/x86/kvm/Makefile7
-rw-r--r--arch/x86/kvm/cpuid.c243
-rw-r--r--arch/x86/kvm/cpuid.h2
-rw-r--r--arch/x86/kvm/debugfs.c9
-rw-r--r--arch/x86/kvm/emulate.c71
-rw-r--r--arch/x86/kvm/hyperv.c16
-rw-r--r--arch/x86/kvm/i8254.c2
-rw-r--r--arch/x86/kvm/i8259.c5
-rw-r--r--arch/x86/kvm/ioapic.c4
-rw-r--r--arch/x86/kvm/ioapic.h1
-rw-r--r--arch/x86/kvm/irq.h1
-rw-r--r--arch/x86/kvm/irq_comm.c19
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h20
-rw-r--r--arch/x86/kvm/kvm_emulate.h1
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c3
-rw-r--r--arch/x86/kvm/lapic.c69
-rw-r--r--arch/x86/kvm/mmu.h16
-rw-r--r--arch/x86/kvm/mmu/mmu.c316
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h9
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h2
-rw-r--r--arch/x86/kvm/mmu/page_track.c8
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h51
-rw-r--r--arch/x86/kvm/mmu/spte.c8
-rw-r--r--arch/x86/kvm/mmu/spte.h44
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c6
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h6
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c75
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h5
-rw-r--r--arch/x86/kvm/pmu.c161
-rw-r--r--arch/x86/kvm/pmu.h5
-rw-r--r--arch/x86/kvm/svm/avic.c134
-rw-r--r--arch/x86/kvm/svm/nested.c271
-rw-r--r--arch/x86/kvm/svm/pmu.c25
-rw-r--r--arch/x86/kvm/svm/sev.c281
-rw-r--r--arch/x86/kvm/svm/svm.c729
-rw-r--r--arch/x86/kvm/svm/svm.h94
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h12
-rw-r--r--arch/x86/kvm/svm/vmenter.S4
-rw-r--r--arch/x86/kvm/trace.h24
-rw-r--r--arch/x86/kvm/vmx/capabilities.h14
-rw-r--r--arch/x86/kvm/vmx/evmcs.c4
-rw-r--r--arch/x86/kvm/vmx/evmcs.h48
-rw-r--r--arch/x86/kvm/vmx/nested.c200
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c63
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c278
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h14
-rw-r--r--arch/x86/kvm/vmx/vmcs.h5
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c4
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h6
-rw-r--r--arch/x86/kvm/vmx/vmenter.S14
-rw-r--r--arch/x86/kvm/vmx/vmx.c482
-rw-r--r--arch/x86/kvm/vmx/vmx.h47
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h43
-rw-r--r--arch/x86/kvm/x86.c683
-rw-r--r--arch/x86/kvm/x86.h26
-rw-r--r--arch/x86/kvm/xen.c335
-rw-r--r--arch/x86/kvm/xen.h9
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_386_32.S86
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S16
-rw-r--r--arch/x86/lib/checksum_32.S27
-rw-r--r--arch/x86/lib/clear_page_64.S6
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S4
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S4
-rw-r--r--arch/x86/lib/copy_mc_64.S18
-rw-r--r--arch/x86/lib/copy_page_64.S4
-rw-r--r--arch/x86/lib/copy_user_64.S55
-rw-r--r--arch/x86/lib/csum-copy_64.S2
-rw-r--r--arch/x86/lib/csum-partial_64.c183
-rw-r--r--arch/x86/lib/error-inject.c3
-rw-r--r--arch/x86/lib/getuser.S22
-rw-r--r--arch/x86/lib/hweight.S6
-rw-r--r--arch/x86/lib/insn-eval.c180
-rw-r--r--arch/x86/lib/iomap_copy_64.S2
-rw-r--r--arch/x86/lib/memcpy_32.c4
-rw-r--r--arch/x86/lib/memcpy_64.S12
-rw-r--r--arch/x86/lib/memmove_64.S4
-rw-r--r--arch/x86/lib/memset_64.S6
-rw-r--r--arch/x86/lib/mmx_32.c388
-rw-r--r--arch/x86/lib/msr-reg.S4
-rw-r--r--arch/x86/lib/putuser.S6
-rw-r--r--arch/x86/lib/retpoline.S4
-rw-r--r--arch/x86/lib/usercopy_32.c67
-rw-r--r--arch/x86/lib/usercopy_64.c8
-rw-r--r--arch/x86/math-emu/div_Xsig.S2
-rw-r--r--arch/x86/math-emu/div_small.S2
-rw-r--r--arch/x86/math-emu/mul_Xsig.S6
-rw-r--r--arch/x86/math-emu/polynom_Xsig.S2
-rw-r--r--arch/x86/math-emu/reg_norm.S6
-rw-r--r--arch/x86/math-emu/reg_round.S2
-rw-r--r--arch/x86/math-emu/reg_u_add.S2
-rw-r--r--arch/x86/math-emu/reg_u_div.S2
-rw-r--r--arch/x86/math-emu/reg_u_mul.S2
-rw-r--r--arch/x86/math-emu/reg_u_sub.S2
-rw-r--r--arch/x86/math-emu/round_Xsig.S4
-rw-r--r--arch/x86/math-emu/shr_Xsig.S8
-rw-r--r--arch/x86/math-emu/wm_shrx.S16
-rw-r--r--arch/x86/mm/Makefile7
-rw-r--r--arch/x86/mm/extable.c113
-rw-r--r--arch/x86/mm/fault.c3
-rw-r--r--arch/x86/mm/init.c5
-rw-r--r--arch/x86/mm/init_64.c2
-rw-r--r--arch/x86/mm/mem_encrypt.c441
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c438
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S4
-rw-r--r--arch/x86/mm/tlb.c10
-rw-r--r--arch/x86/net/bpf_jit_comp.c122
-rw-r--r--arch/x86/net/bpf_jit_comp32.c4
-rw-r--r--arch/x86/pci/acpi.c2
-rw-r--r--arch/x86/pci/fixup.c4
-rw-r--r--arch/x86/pci/xen.c38
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts4
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S2
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S2
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S16
-rw-r--r--arch/x86/platform/efi/quirks.c3
-rw-r--r--arch/x86/platform/olpc/xo1-wakeup.S6
-rw-r--r--arch/x86/power/hibernate_asm_32.S4
-rw-r--r--arch/x86/power/hibernate_asm_64.S4
-rw-r--r--arch/x86/purgatory/Makefile2
-rw-r--r--arch/x86/realmode/init.c38
-rw-r--r--arch/x86/tools/relocs.c2
-rw-r--r--arch/x86/um/Kconfig1
-rw-r--r--arch/x86/um/Makefile2
-rw-r--r--arch/x86/um/asm/barrier.h1
-rw-r--r--arch/x86/um/asm/segment.h8
-rw-r--r--arch/x86/um/checksum_32.S4
-rw-r--r--arch/x86/um/os-Linux/registers.c1
-rw-r--r--arch/x86/um/ptrace_32.c1
-rw-r--r--arch/x86/um/ptrace_64.c1
-rw-r--r--arch/x86/um/setjmp_32.S2
-rw-r--r--arch/x86/um/setjmp_64.S2
-rw-r--r--arch/x86/um/shared/sysdep/syscalls_64.h3
-rw-r--r--arch/x86/um/signal.c1
-rw-r--r--arch/x86/um/sys_call_table_32.c4
-rw-r--r--arch/x86/um/sys_call_table_64.c17
-rw-r--r--arch/x86/um/syscalls_64.c14
-rw-r--r--arch/x86/xen/Kconfig1
-rw-r--r--arch/x86/xen/pmu.c32
-rw-r--r--arch/x86/xen/vga.c12
-rw-r--r--arch/x86/xen/xen-asm.S32
-rw-r--r--arch/x86/xen/xen-head.S2
341 files changed, 7485 insertions, 5158 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7399327d1eff..9f5bd41bf660 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -78,7 +78,7 @@ config X86
select ARCH_HAS_FILTER_PGPROT
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_KCOV if X86_64 && STACK_VALIDATION
+ select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
@@ -104,6 +104,7 @@ config X86
select ARCH_SUPPORTS_ACPI
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
+ select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
select ARCH_SUPPORTS_LTO_CLANG
@@ -136,7 +137,6 @@ config X86
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
select GENERIC_ENTRY
- select GENERIC_FIND_FIRST_BIT
select GENERIC_IOMAP
select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC
@@ -186,6 +186,7 @@ config X86
select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING
select HAVE_C_RECORDMCOUNT
select HAVE_OBJTOOL_MCOUNT if STACK_VALIDATION
+ select HAVE_BUILDTIME_MCOUNT_SORT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
@@ -239,6 +240,7 @@ config X86
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
select HAVE_FUNCTION_ARG_ACCESS_API
+ select HAVE_SETUP_PER_CPU_AREA
select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
select HAVE_STACK_VALIDATION if X86_64
@@ -252,6 +254,8 @@ config X86
select HAVE_GENERIC_VDSO
select HOTPLUG_SMT if SMP
select IRQ_FORCED_THREADING
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
+ select NEED_PER_CPU_PAGE_FIRST_CHUNK
select NEED_SG_DMA_LENGTH
select PCI_DOMAINS if PCI
select PCI_LOCKLESS_CONFIG if PCI
@@ -269,6 +273,7 @@ config X86
select HAVE_ARCH_KCSAN if X86_64
select X86_FEATURE_NAMES if PROC_FS
select PROC_PID_ARCH_STATUS if PROC_FS
+ select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX
imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
config INSTRUCTION_DECODER
@@ -331,15 +336,6 @@ config ARCH_HAS_CPU_RELAX
config ARCH_HAS_FILTER_PGPROT
def_bool y
-config HAVE_SETUP_PER_CPU_AREA
- def_bool y
-
-config NEED_PER_CPU_EMBED_FIRST_CHUNK
- def_bool y
-
-config NEED_PER_CPU_PAGE_FIRST_CHUNK
- def_bool y
-
config ARCH_HIBERNATION_POSSIBLE
def_bool y
@@ -472,6 +468,18 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
+config CC_HAS_SLS
+ def_bool $(cc-option,-mharden-sls=all)
+
+config SLS
+ bool "Mitigate Straight-Line-Speculation"
+ depends on CC_HAS_SLS && X86_64
+ default n
+ help
+ Compile the kernel with straight-line-speculation options to guard
+ against straight line speculation. The kernel image might be slightly
+ larger.
+
config X86_CPU_RESCTRL
bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
@@ -1523,16 +1531,20 @@ config X86_CPA_STATISTICS
helps to determine the effectiveness of preserving large and huge
page mappings when mapping protections are changed.
+config X86_MEM_ENCRYPT
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ select DYNAMIC_PHYSICAL_MASK
+ select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
+ def_bool n
+
config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
select DMA_COHERENT_POOL
- select DYNAMIC_PHYSICAL_MASK
select ARCH_USE_MEMREMAP_PROT
- select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
- select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
select ARCH_HAS_CC_PLATFORM
+ select X86_MEM_ENCRYPT
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
@@ -1557,6 +1569,7 @@ config NUMA
depends on SMP
depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
default y if X86_BIGSMP
+ select USE_PERCPU_NUMA_NODE_ID
help
Enable NUMA (Non-Uniform Memory Access) support.
@@ -1917,6 +1930,7 @@ config X86_SGX
select SRCU
select MMU_NOTIFIER
select NUMA_KEEP_MEMINFO if NUMA
+ select XARRAY_MULTI
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
@@ -1932,6 +1946,7 @@ config EFI
depends on ACPI
select UCS2_STRING
select EFI_RUNTIME_WRAPPERS
+ select ARCH_USE_MEMREMAP_PROT
help
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
@@ -1945,7 +1960,7 @@ config EFI
config EFI_STUB
bool "EFI stub support"
- depends on EFI && !X86_USE_3DNOW
+ depends on EFI
depends on $(cc-option,-mabi=ms) || X86_32
select RELOCATABLE
help
@@ -2430,10 +2445,6 @@ config ARCH_HAS_ADD_PAGES
config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
def_bool y
-config USE_PERCPU_NUMA_NODE_ID
- def_bool y
- depends on NUMA
-
menu "Power management and ACPI options"
config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index eefc434351db..542377cd419d 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -342,10 +342,6 @@ config X86_USE_PPRO_CHECKSUM
def_bool y
depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
-config X86_USE_3DNOW
- def_bool y
- depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML
-
#
# P6_NOPs are a relatively minor optimization that require a family >=
# 6 processor, except that it is broken on certain VIA chips.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 42243869216d..e84cdd409b64 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -12,6 +12,18 @@ else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
+ifdef CONFIG_CC_IS_GCC
+RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
+RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
+RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register)
+endif
+ifdef CONFIG_CC_IS_CLANG
+RETPOLINE_CFLAGS := -mretpoline-external-thunk
+RETPOLINE_VDSO_CFLAGS := -mretpoline
+endif
+export RETPOLINE_CFLAGS
+export RETPOLINE_VDSO_CFLAGS
+
# For gcc stack alignment is specified with -mpreferred-stack-boundary,
# clang has the option -mstack-alignment for that purpose.
ifneq ($(call cc-option, -mpreferred-stack-boundary=4),)
@@ -179,6 +191,10 @@ ifdef CONFIG_RETPOLINE
endif
endif
+ifdef CONFIG_SLS
+ KBUILD_CFLAGS += -mharden-sls=all
+endif
+
KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
ifdef CONFIG_LTO_CLANG
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 431bf7f846c3..6115274fe10f 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -28,7 +28,11 @@ KCOV_INSTRUMENT := n
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst
-KBUILD_CFLAGS := -m$(BITS) -O2
+# CLANG_FLAGS must come before any cc-disable-warning or cc-option calls in
+# case of cross compiling, as it has the '--target=' flag, which is needed to
+# avoid errors with '-march=i386', and future flags may depend on the target to
+# be valid.
+KBUILD_CFLAGS := -m$(BITS) -O2 $(CLANG_FLAGS)
KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
KBUILD_CFLAGS += -Wundef
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
@@ -47,7 +51,6 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
# Disable relocation relaxation in case the link is not PIE.
KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
-KBUILD_CFLAGS += $(CLANG_FLAGS)
# sev.c indirectly inludes inat-table.h which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
@@ -123,17 +126,17 @@ vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,bzip2)
+ $(call if_changed,bzip2_with_size)
$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzma)
+ $(call if_changed,lzma_with_size)
$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,xzkern)
+ $(call if_changed,xzkern_with_size)
$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lzo)
+ $(call if_changed,lzo_with_size)
$(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,lz4)
+ $(call if_changed,lz4_with_size)
$(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,zstd22)
+ $(call if_changed,zstd22_with_size)
suffix-$(CONFIG_KERNEL_GZIP) := gz
suffix-$(CONFIG_KERNEL_BZIP2) := bz2
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
index 8bb92e9f4e97..67e7edcdfea8 100644
--- a/arch/x86/boot/compressed/efi_thunk_64.S
+++ b/arch/x86/boot/compressed/efi_thunk_64.S
@@ -26,8 +26,6 @@ SYM_FUNC_START(__efi64_thunk)
push %rbp
push %rbx
- leaq 1f(%rip), %rbp
-
movl %ds, %eax
push %rax
movl %es, %eax
@@ -35,6 +33,11 @@ SYM_FUNC_START(__efi64_thunk)
movl %ss, %eax
push %rax
+ /* Copy args passed on stack */
+ movq 0x30(%rsp), %rbp
+ movq 0x38(%rsp), %rbx
+ movq 0x40(%rsp), %rax
+
/*
* Convert x86-64 ABI params to i386 ABI
*/
@@ -44,13 +47,18 @@ SYM_FUNC_START(__efi64_thunk)
movl %ecx, 0x8(%rsp)
movl %r8d, 0xc(%rsp)
movl %r9d, 0x10(%rsp)
+ movl %ebp, 0x14(%rsp)
+ movl %ebx, 0x18(%rsp)
+ movl %eax, 0x1c(%rsp)
- leaq 0x14(%rsp), %rbx
+ leaq 0x20(%rsp), %rbx
sgdt (%rbx)
addq $16, %rbx
sidt (%rbx)
+ leaq 1f(%rip), %rbp
+
/*
* Switch to IDT and GDT with 32-bit segments. This is the firmware GDT
* and IDT that was installed when the kernel started executing. The
@@ -93,7 +101,7 @@ SYM_FUNC_START(__efi64_thunk)
pop %rbx
pop %rbp
- ret
+ RET
SYM_FUNC_END(__efi64_thunk)
.code32
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 572c535cf45b..fd9441f40457 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -813,7 +813,7 @@ SYM_FUNC_START(efi32_pe_entry)
2: popl %edi // restore callee-save registers
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(efi32_pe_entry)
.section ".rodata"
@@ -868,7 +868,7 @@ SYM_FUNC_START(startup32_set_idt_entry)
pop %ecx
pop %ebx
- ret
+ RET
SYM_FUNC_END(startup32_set_idt_entry)
#endif
@@ -884,7 +884,7 @@ SYM_FUNC_START(startup32_load_idt)
movl %eax, rva(boot32_idt_desc+2)(%ebp)
lidt rva(boot32_idt_desc)(%ebp)
#endif
- ret
+ RET
SYM_FUNC_END(startup32_load_idt)
/*
@@ -954,7 +954,7 @@ SYM_FUNC_START(startup32_check_sev_cbit)
popl %ebx
popl %eax
#endif
- ret
+ RET
SYM_FUNC_END(startup32_check_sev_cbit)
/*
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index c1e81a848b2a..a63424d13627 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -58,7 +58,7 @@ SYM_FUNC_START(get_sev_encryption_bit)
#endif /* CONFIG_AMD_MEM_ENCRYPT */
- ret
+ RET
SYM_FUNC_END(get_sev_encryption_bit)
/**
@@ -92,7 +92,7 @@ SYM_CODE_START_LOCAL(sev_es_req_cpuid)
/* All good - return success */
xorl %eax, %eax
1:
- ret
+ RET
2:
movl $-1, %eax
jmp 1b
@@ -221,7 +221,7 @@ SYM_FUNC_START(set_sev_encryption_mask)
#endif
xor %rax, %rax
- ret
+ RET
SYM_FUNC_END(set_sev_encryption_mask)
.data
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index 670e998fe930..28bcf04c022e 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -122,7 +122,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
static bool early_setup_sev_es(void)
{
if (!sev_es_negotiate_protocol())
- sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED);
+ sev_es_terminate(GHCB_SEV_ES_PROT_UNSUPPORTED);
if (set_page_decrypted((unsigned long)&boot_ghcb_page))
return false;
@@ -175,7 +175,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
enum es_result result;
if (!boot_ghcb && !early_setup_sev_es())
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
result = vc_init_em_ctxt(&ctxt, regs, exit_code);
@@ -202,5 +202,5 @@ finish:
if (result == ES_OK)
vc_finish_insn(&ctxt);
else if (result != ES_RETRY)
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
}
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
index a232da487cd2..e5d2c6b8c2f1 100644
--- a/arch/x86/boot/string.h
+++ b/arch/x86/boot/string.h
@@ -8,8 +8,10 @@
#undef memcmp
void *memcpy(void *dst, const void *src, size_t len);
+void *memmove(void *dst, const void *src, size_t len);
void *memset(void *dst, int c, size_t len);
int memcmp(const void *s1, const void *s2, size_t len);
+int bcmp(const void *s1, const void *s2, size_t len);
/* Access builtin version by default. */
#define memcpy(d,s,l) __builtin_memcpy(d,s,l)
@@ -25,6 +27,7 @@ extern size_t strnlen(const char *s, size_t maxlen);
extern unsigned int atou(const char *s);
extern unsigned long long simple_strtoull(const char *cp, char **endp,
unsigned int base);
+long simple_strtol(const char *cp, char **endp, unsigned int base);
int kstrtoull(const char *s, unsigned int base, unsigned long long *res);
int boot_kstrtoul(const char *s, unsigned int base, unsigned long *res);
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index e81885384f60..71124cf8630c 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,4 +1,3 @@
-# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_AUDIT=y
@@ -262,3 +261,4 @@ CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_KALLSYMS_ALL=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index e8a7a0af2bda..92b1169ec90b 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,4 +1,3 @@
-# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_AUDIT=y
@@ -258,3 +257,4 @@ CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_BOOT_PARAMS=y
+CONFIG_KALLSYMS_ALL=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index f307c93fc90a..c3af959648e6 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -62,7 +62,9 @@ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
-blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+blake2s-x86_64-y := blake2s-shash.o
+obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o
+libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index 51d46d93efbc..b48ddebb4748 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -122,7 +122,7 @@ SYM_FUNC_START_LOCAL(__load_partial)
pxor T0, MSG
.Lld_partial_8:
- ret
+ RET
SYM_FUNC_END(__load_partial)
/*
@@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(__store_partial)
mov %r10b, (%r9)
.Lst_partial_1:
- ret
+ RET
SYM_FUNC_END(__store_partial)
/*
@@ -225,7 +225,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_init)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_init)
/*
@@ -337,7 +337,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_1:
movdqu STATE4, 0x00(STATEP)
@@ -346,7 +346,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_2:
movdqu STATE3, 0x00(STATEP)
@@ -355,7 +355,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_3:
movdqu STATE2, 0x00(STATEP)
@@ -364,7 +364,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out_4:
movdqu STATE1, 0x00(STATEP)
@@ -373,11 +373,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lad_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_ad)
.macro encrypt_block a s0 s1 s2 s3 s4 i
@@ -452,7 +452,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_1:
movdqu STATE3, 0x00(STATEP)
@@ -461,7 +461,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_2:
movdqu STATE2, 0x00(STATEP)
@@ -470,7 +470,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_3:
movdqu STATE1, 0x00(STATEP)
@@ -479,7 +479,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out_4:
movdqu STATE0, 0x00(STATEP)
@@ -488,11 +488,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Lenc_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_enc)
/*
@@ -532,7 +532,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
.macro decrypt_block a s0 s1 s2 s3 s4 i
@@ -606,7 +606,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_1:
movdqu STATE3, 0x00(STATEP)
@@ -615,7 +615,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_2:
movdqu STATE2, 0x00(STATEP)
@@ -624,7 +624,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_3:
movdqu STATE1, 0x00(STATEP)
@@ -633,7 +633,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out_4:
movdqu STATE0, 0x00(STATEP)
@@ -642,11 +642,11 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
FRAME_END
- ret
+ RET
.Ldec_out:
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_dec)
/*
@@ -696,7 +696,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
movdqu STATE3, 0x40(STATEP)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
/*
@@ -743,5 +743,5 @@ SYM_FUNC_START(crypto_aegis128_aesni_final)
movdqu MSG, (%rsi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(crypto_aegis128_aesni_final)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index 3f0fc7dd87d7..c799838242a6 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -525,7 +525,7 @@ ddq_add_8:
/* return updated IV */
vpshufb xbyteswap, xcounter, xcounter
vmovdqu xcounter, (p_iv)
- ret
+ RET
.endm
/*
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 4e3972570916..363699dd7220 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1594,7 +1594,7 @@ SYM_FUNC_START(aesni_gcm_dec)
GCM_ENC_DEC dec
GCM_COMPLETE arg10, arg11
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec)
@@ -1683,7 +1683,7 @@ SYM_FUNC_START(aesni_gcm_enc)
GCM_COMPLETE arg10, arg11
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc)
/*****************************************************************************
@@ -1701,7 +1701,7 @@ SYM_FUNC_START(aesni_gcm_init)
FUNC_SAVE
GCM_INIT %arg3, %arg4,%arg5, %arg6
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init)
/*****************************************************************************
@@ -1716,7 +1716,7 @@ SYM_FUNC_START(aesni_gcm_enc_update)
FUNC_SAVE
GCM_ENC_DEC enc
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update)
/*****************************************************************************
@@ -1731,7 +1731,7 @@ SYM_FUNC_START(aesni_gcm_dec_update)
FUNC_SAVE
GCM_ENC_DEC dec
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update)
/*****************************************************************************
@@ -1746,7 +1746,7 @@ SYM_FUNC_START(aesni_gcm_finalize)
FUNC_SAVE
GCM_COMPLETE %arg3 %arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize)
#endif
@@ -1762,7 +1762,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_256a)
pxor %xmm1, %xmm0
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_END_ALIAS(_key_expansion_128)
@@ -1787,7 +1787,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192a)
shufps $0b01001110, %xmm2, %xmm1
movaps %xmm1, 0x10(TKEYP)
add $0x20, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_192a)
SYM_FUNC_START_LOCAL(_key_expansion_192b)
@@ -1806,7 +1806,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_192b)
movaps %xmm0, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_192b)
SYM_FUNC_START_LOCAL(_key_expansion_256b)
@@ -1818,7 +1818,7 @@ SYM_FUNC_START_LOCAL(_key_expansion_256b)
pxor %xmm1, %xmm2
movaps %xmm2, (TKEYP)
add $0x10, TKEYP
- ret
+ RET
SYM_FUNC_END(_key_expansion_256b)
/*
@@ -1933,7 +1933,7 @@ SYM_FUNC_START(aesni_set_key)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_set_key)
/*
@@ -1957,7 +1957,7 @@ SYM_FUNC_START(aesni_enc)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_enc)
/*
@@ -2014,7 +2014,7 @@ SYM_FUNC_START_LOCAL(_aesni_enc1)
aesenc KEY, STATE
movaps 0x70(TKEYP), KEY
aesenclast KEY, STATE
- ret
+ RET
SYM_FUNC_END(_aesni_enc1)
/*
@@ -2122,7 +2122,7 @@ SYM_FUNC_START_LOCAL(_aesni_enc4)
aesenclast KEY, STATE2
aesenclast KEY, STATE3
aesenclast KEY, STATE4
- ret
+ RET
SYM_FUNC_END(_aesni_enc4)
/*
@@ -2147,7 +2147,7 @@ SYM_FUNC_START(aesni_dec)
popl KEYP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_dec)
/*
@@ -2204,7 +2204,7 @@ SYM_FUNC_START_LOCAL(_aesni_dec1)
aesdec KEY, STATE
movaps 0x70(TKEYP), KEY
aesdeclast KEY, STATE
- ret
+ RET
SYM_FUNC_END(_aesni_dec1)
/*
@@ -2312,7 +2312,7 @@ SYM_FUNC_START_LOCAL(_aesni_dec4)
aesdeclast KEY, STATE2
aesdeclast KEY, STATE3
aesdeclast KEY, STATE4
- ret
+ RET
SYM_FUNC_END(_aesni_dec4)
/*
@@ -2372,7 +2372,7 @@ SYM_FUNC_START(aesni_ecb_enc)
popl LEN
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ecb_enc)
/*
@@ -2433,7 +2433,7 @@ SYM_FUNC_START(aesni_ecb_dec)
popl LEN
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ecb_dec)
/*
@@ -2477,7 +2477,7 @@ SYM_FUNC_START(aesni_cbc_enc)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cbc_enc)
/*
@@ -2570,7 +2570,7 @@ SYM_FUNC_START(aesni_cbc_dec)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cbc_dec)
/*
@@ -2627,7 +2627,7 @@ SYM_FUNC_START(aesni_cts_cbc_enc)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cts_cbc_enc)
/*
@@ -2688,7 +2688,7 @@ SYM_FUNC_START(aesni_cts_cbc_dec)
popl IVP
#endif
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_cts_cbc_dec)
.pushsection .rodata
@@ -2725,7 +2725,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc_init)
mov $1, TCTR_LOW
movq TCTR_LOW, INC
movq CTR, TCTR_LOW
- ret
+ RET
SYM_FUNC_END(_aesni_inc_init)
/*
@@ -2753,7 +2753,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc)
.Linc_low:
movaps CTR, IV
pshufb BSWAP_MASK, IV
- ret
+ RET
SYM_FUNC_END(_aesni_inc)
/*
@@ -2816,7 +2816,7 @@ SYM_FUNC_START(aesni_ctr_enc)
movups IV, (IVP)
.Lctr_enc_just_ret:
FRAME_END
- ret
+ RET
SYM_FUNC_END(aesni_ctr_enc)
#endif
@@ -2932,7 +2932,7 @@ SYM_FUNC_START(aesni_xts_encrypt)
popl IVP
#endif
FRAME_END
- ret
+ RET
.Lxts_enc_1x:
add $64, LEN
@@ -3092,7 +3092,7 @@ SYM_FUNC_START(aesni_xts_decrypt)
popl IVP
#endif
FRAME_END
- ret
+ RET
.Lxts_dec_1x:
add $64, LEN
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 98e3552b6e03..0852ab573fd3 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -1767,7 +1767,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen2)
FUNC_SAVE
INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init_avx_gen2)
###############################################################################
@@ -1788,15 +1788,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
FUNC_RESTORE
- ret
+ RET
key_128_enc_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
FUNC_RESTORE
- ret
+ RET
key_256_enc_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
###############################################################################
@@ -1817,15 +1817,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
FUNC_RESTORE
- ret
+ RET
key_128_dec_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
FUNC_RESTORE
- ret
+ RET
key_256_dec_update:
GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
###############################################################################
@@ -1846,15 +1846,15 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
# must be 192
GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_128_finalize:
GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_256_finalize:
GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
###############################################################################
@@ -2735,7 +2735,7 @@ SYM_FUNC_START(aesni_gcm_init_avx_gen4)
FUNC_SAVE
INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
###############################################################################
@@ -2756,15 +2756,15 @@ SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
FUNC_RESTORE
- ret
+ RET
key_128_enc_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
FUNC_RESTORE
- ret
+ RET
key_256_enc_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
###############################################################################
@@ -2785,15 +2785,15 @@ SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
# must be 192
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
FUNC_RESTORE
- ret
+ RET
key_128_dec_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
FUNC_RESTORE
- ret
+ RET
key_256_dec_update4:
GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
###############################################################################
@@ -2814,13 +2814,13 @@ SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
# must be 192
GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_128_finalize4:
GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
FUNC_RESTORE
- ret
+ RET
key_256_finalize4:
GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
FUNC_RESTORE
- ret
+ RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index e09f4672dd38..41901ba9d3a2 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1107,7 +1107,7 @@ static struct aead_alg aesni_aeads[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx),
- .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_alignmask = 0,
.cra_module = THIS_MODULE,
},
}, {
@@ -1124,7 +1124,7 @@ static struct aead_alg aesni_aeads[] = { {
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct generic_gcmaes_ctx),
- .cra_alignmask = AESNI_ALIGN - 1,
+ .cra_alignmask = 0,
.cra_module = THIS_MODULE,
},
} };
diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S
index 2ca79974f819..b50b35ff1fdb 100644
--- a/arch/x86/crypto/blake2s-core.S
+++ b/arch/x86/crypto/blake2s-core.S
@@ -171,7 +171,7 @@ SYM_FUNC_START(blake2s_compress_ssse3)
movdqu %xmm1,0x10(%rdi)
movdqu %xmm14,0x20(%rdi)
.Lendofloop:
- ret
+ RET
SYM_FUNC_END(blake2s_compress_ssse3)
#ifdef CONFIG_AS_AVX512
@@ -251,6 +251,6 @@ SYM_FUNC_START(blake2s_compress_avx512)
vmovdqu %xmm1,0x10(%rdi)
vmovdqu %xmm4,0x20(%rdi)
vzeroupper
- retq
+ RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index a40365ab301e..69853c13e8fb 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -5,7 +5,6 @@
#include <crypto/internal/blake2s.h>
#include <crypto/internal/simd.h>
-#include <crypto/internal/hash.h>
#include <linux/types.h>
#include <linux/jump_label.h>
@@ -28,9 +27,8 @@ asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-void blake2s_compress_arch(struct blake2s_state *state,
- const u8 *block, size_t nblocks,
- const u32 inc)
+void blake2s_compress(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
@@ -56,49 +54,12 @@ void blake2s_compress_arch(struct blake2s_state *state,
block += blocks * BLAKE2S_BLOCK_SIZE;
} while (nblocks);
}
-EXPORT_SYMBOL(blake2s_compress_arch);
-
-static int crypto_blake2s_update_x86(struct shash_desc *desc,
- const u8 *in, unsigned int inlen)
-{
- return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
-}
-
-static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
-{
- return crypto_blake2s_final(desc, out, blake2s_compress_arch);
-}
-
-#define BLAKE2S_ALG(name, driver_name, digest_size) \
- { \
- .base.cra_name = name, \
- .base.cra_driver_name = driver_name, \
- .base.cra_priority = 200, \
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
- .base.cra_module = THIS_MODULE, \
- .digestsize = digest_size, \
- .setkey = crypto_blake2s_setkey, \
- .init = crypto_blake2s_init, \
- .update = crypto_blake2s_update_x86, \
- .final = crypto_blake2s_final_x86, \
- .descsize = sizeof(struct blake2s_state), \
- }
-
-static struct shash_alg blake2s_algs[] = {
- BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
- BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
- BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
- BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
-};
+EXPORT_SYMBOL(blake2s_compress);
static int __init blake2s_mod_init(void)
{
- if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return 0;
-
- static_branch_enable(&blake2s_use_ssse3);
+ if (boot_cpu_has(X86_FEATURE_SSSE3))
+ static_branch_enable(&blake2s_use_ssse3);
if (IS_ENABLED(CONFIG_AS_AVX512) &&
boot_cpu_has(X86_FEATURE_AVX) &&
@@ -109,26 +70,9 @@ static int __init blake2s_mod_init(void)
XFEATURE_MASK_AVX512, NULL))
static_branch_enable(&blake2s_use_avx512);
- return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
- crypto_register_shashes(blake2s_algs,
- ARRAY_SIZE(blake2s_algs)) : 0;
-}
-
-static void __exit blake2s_mod_exit(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+ return 0;
}
module_init(blake2s_mod_init);
-module_exit(blake2s_mod_exit);
-MODULE_ALIAS_CRYPTO("blake2s-128");
-MODULE_ALIAS_CRYPTO("blake2s-128-x86");
-MODULE_ALIAS_CRYPTO("blake2s-160");
-MODULE_ALIAS_CRYPTO("blake2s-160-x86");
-MODULE_ALIAS_CRYPTO("blake2s-224");
-MODULE_ALIAS_CRYPTO("blake2s-224-x86");
-MODULE_ALIAS_CRYPTO("blake2s-256");
-MODULE_ALIAS_CRYPTO("blake2s-256-x86");
MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c
new file mode 100644
index 000000000000..f9e2fecdb761
--- /dev/null
+++ b/arch/x86/crypto/blake2s-shash.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/hash.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+static int crypto_blake2s_update_x86(struct shash_desc *desc,
+ const u8 *in, unsigned int inlen)
+{
+ return crypto_blake2s_update(desc, in, inlen, blake2s_compress);
+}
+
+static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
+{
+ return crypto_blake2s_final(desc, out, blake2s_compress);
+}
+
+#define BLAKE2S_ALG(name, driver_name, digest_size) \
+ { \
+ .base.cra_name = name, \
+ .base.cra_driver_name = driver_name, \
+ .base.cra_priority = 200, \
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
+ .base.cra_module = THIS_MODULE, \
+ .digestsize = digest_size, \
+ .setkey = crypto_blake2s_setkey, \
+ .init = crypto_blake2s_init, \
+ .update = crypto_blake2s_update_x86, \
+ .final = crypto_blake2s_final_x86, \
+ .descsize = sizeof(struct blake2s_state), \
+ }
+
+static struct shash_alg blake2s_algs[] = {
+ BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
+};
+
+static int __init blake2s_mod_init(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+ return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+ return 0;
+}
+
+static void __exit blake2s_mod_exit(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+module_init(blake2s_mod_init);
+module_exit(blake2s_mod_exit);
+
+MODULE_ALIAS_CRYPTO("blake2s-128");
+MODULE_ALIAS_CRYPTO("blake2s-128-x86");
+MODULE_ALIAS_CRYPTO("blake2s-160");
+MODULE_ALIAS_CRYPTO("blake2s-160-x86");
+MODULE_ALIAS_CRYPTO("blake2s-224");
+MODULE_ALIAS_CRYPTO("blake2s-224-x86");
+MODULE_ALIAS_CRYPTO("blake2s-256");
+MODULE_ALIAS_CRYPTO("blake2s-256-x86");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 4222ac6d6584..802d71582689 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -135,10 +135,10 @@ SYM_FUNC_START(__blowfish_enc_blk)
jnz .L__enc_xor;
write_block();
- ret;
+ RET;
.L__enc_xor:
xor_block();
- ret;
+ RET;
SYM_FUNC_END(__blowfish_enc_blk)
SYM_FUNC_START(blowfish_dec_blk)
@@ -170,7 +170,7 @@ SYM_FUNC_START(blowfish_dec_blk)
movq %r11, %r12;
- ret;
+ RET;
SYM_FUNC_END(blowfish_dec_blk)
/**********************************************************************
@@ -322,14 +322,14 @@ SYM_FUNC_START(__blowfish_enc_blk_4way)
popq %rbx;
popq %r12;
- ret;
+ RET;
.L__enc_xor4:
xor_block4();
popq %rbx;
popq %r12;
- ret;
+ RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)
SYM_FUNC_START(blowfish_dec_blk_4way)
@@ -364,5 +364,5 @@ SYM_FUNC_START(blowfish_dec_blk_4way)
popq %rbx;
popq %r12;
- ret;
+ RET;
SYM_FUNC_END(blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index e2a0e0f4bf9d..2e1658ddbe1a 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -192,7 +192,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c
roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
%rcx, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
.align 8
@@ -200,7 +200,7 @@ SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a
roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
%xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
%rax, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
@@ -778,7 +778,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
%xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Lenc_max32:
@@ -865,7 +865,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
%xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Ldec_max32:
@@ -906,7 +906,7 @@ SYM_FUNC_START(camellia_ecb_enc_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_enc_16way)
SYM_FUNC_START(camellia_ecb_dec_16way)
@@ -936,7 +936,7 @@ SYM_FUNC_START(camellia_ecb_dec_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_dec_16way)
SYM_FUNC_START(camellia_cbc_dec_16way)
@@ -987,5 +987,5 @@ SYM_FUNC_START(camellia_cbc_dec_16way)
%xmm8, %rsi);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_cbc_dec_16way)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 706f70829a07..0e4e9abbf4de 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -226,7 +226,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c
roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
%rcx, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
.align 8
@@ -234,7 +234,7 @@ SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_a
roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
%ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
%rax, (%r9));
- ret;
+ RET;
SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
/*
@@ -814,7 +814,7 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
%ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Lenc_max32:
@@ -901,7 +901,7 @@ SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
%ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
FRAME_END
- ret;
+ RET;
.align 8
.Ldec_max32:
@@ -946,7 +946,7 @@ SYM_FUNC_START(camellia_ecb_enc_32way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_enc_32way)
SYM_FUNC_START(camellia_ecb_dec_32way)
@@ -980,7 +980,7 @@ SYM_FUNC_START(camellia_ecb_dec_32way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_ecb_dec_32way)
SYM_FUNC_START(camellia_cbc_dec_32way)
@@ -1047,5 +1047,5 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
addq $(16 * 32), %rsp;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(camellia_cbc_dec_32way)
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 1372e6408850..347c059f5940 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -213,13 +213,13 @@ SYM_FUNC_START(__camellia_enc_blk)
enc_outunpack(mov, RT1);
movq RR12, %r12;
- ret;
+ RET;
.L__enc_xor:
enc_outunpack(xor, RT1);
movq RR12, %r12;
- ret;
+ RET;
SYM_FUNC_END(__camellia_enc_blk)
SYM_FUNC_START(camellia_dec_blk)
@@ -257,7 +257,7 @@ SYM_FUNC_START(camellia_dec_blk)
dec_outunpack();
movq RR12, %r12;
- ret;
+ RET;
SYM_FUNC_END(camellia_dec_blk)
/**********************************************************************
@@ -448,14 +448,14 @@ SYM_FUNC_START(__camellia_enc_blk_2way)
movq RR12, %r12;
popq %rbx;
- ret;
+ RET;
.L__enc2_xor:
enc_outunpack2(xor, RT2);
movq RR12, %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(__camellia_enc_blk_2way)
SYM_FUNC_START(camellia_dec_blk_2way)
@@ -495,5 +495,5 @@ SYM_FUNC_START(camellia_dec_blk_2way)
movq RR12, %r12;
movq RXOR, %rbx;
- ret;
+ RET;
SYM_FUNC_END(camellia_dec_blk_2way)
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 8a6181b08b59..b258af420c92 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -279,7 +279,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast5_enc_blk16)
.align 16
@@ -352,7 +352,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
- ret;
+ RET;
.L__skip_dec:
vpsrldq $4, RKR, RKR;
@@ -393,7 +393,7 @@ SYM_FUNC_START(cast5_ecb_enc_16way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ecb_enc_16way)
SYM_FUNC_START(cast5_ecb_dec_16way)
@@ -431,7 +431,7 @@ SYM_FUNC_START(cast5_ecb_dec_16way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ecb_dec_16way)
SYM_FUNC_START(cast5_cbc_dec_16way)
@@ -483,7 +483,7 @@ SYM_FUNC_START(cast5_cbc_dec_16way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_cbc_dec_16way)
SYM_FUNC_START(cast5_ctr_16way)
@@ -559,5 +559,5 @@ SYM_FUNC_START(cast5_ctr_16way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index fbddcecc3e3f..82b716fd5dba 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -289,7 +289,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast6_enc_blk8)
.align 8
@@ -336,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
- ret;
+ RET;
SYM_FUNC_END(__cast6_dec_blk8)
SYM_FUNC_START(cast6_ecb_enc_8way)
@@ -359,7 +359,7 @@ SYM_FUNC_START(cast6_ecb_enc_8way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_ecb_enc_8way)
SYM_FUNC_START(cast6_ecb_dec_8way)
@@ -382,7 +382,7 @@ SYM_FUNC_START(cast6_ecb_dec_8way)
popq %r15;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_ecb_dec_8way)
SYM_FUNC_START(cast6_cbc_dec_8way)
@@ -408,5 +408,5 @@ SYM_FUNC_START(cast6_cbc_dec_8way)
popq %r15;
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(cast6_cbc_dec_8way)
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S
index ee9a40ab4109..f3d8fc018249 100644
--- a/arch/x86/crypto/chacha-avx2-x86_64.S
+++ b/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -193,7 +193,7 @@ SYM_FUNC_START(chacha_2block_xor_avx2)
.Ldone2:
vzeroupper
- ret
+ RET
.Lxorpart2:
# xor remaining bytes from partial register into output
@@ -498,7 +498,7 @@ SYM_FUNC_START(chacha_4block_xor_avx2)
.Ldone4:
vzeroupper
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
@@ -992,7 +992,7 @@ SYM_FUNC_START(chacha_8block_xor_avx2)
.Ldone8:
vzeroupper
lea -8(%r10),%rsp
- ret
+ RET
.Lxorpart8:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S
index bb193fde123a..946f74dd6fba 100644
--- a/arch/x86/crypto/chacha-avx512vl-x86_64.S
+++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S
@@ -166,7 +166,7 @@ SYM_FUNC_START(chacha_2block_xor_avx512vl)
.Ldone2:
vzeroupper
- ret
+ RET
.Lxorpart2:
# xor remaining bytes from partial register into output
@@ -432,7 +432,7 @@ SYM_FUNC_START(chacha_4block_xor_avx512vl)
.Ldone4:
vzeroupper
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
@@ -812,7 +812,7 @@ SYM_FUNC_START(chacha_8block_xor_avx512vl)
.Ldone8:
vzeroupper
- ret
+ RET
.Lxorpart8:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
index ca1788bfee16..7111949cd5b9 100644
--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -108,7 +108,7 @@ SYM_FUNC_START_LOCAL(chacha_permute)
sub $2,%r8d
jnz .Ldoubleround
- ret
+ RET
SYM_FUNC_END(chacha_permute)
SYM_FUNC_START(chacha_block_xor_ssse3)
@@ -166,7 +166,7 @@ SYM_FUNC_START(chacha_block_xor_ssse3)
.Ldone:
FRAME_END
- ret
+ RET
.Lxorpart:
# xor remaining bytes from partial register into output
@@ -217,7 +217,7 @@ SYM_FUNC_START(hchacha_block_ssse3)
movdqu %xmm3,0x10(%rsi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(hchacha_block_ssse3)
SYM_FUNC_START(chacha_4block_xor_ssse3)
@@ -762,7 +762,7 @@ SYM_FUNC_START(chacha_4block_xor_ssse3)
.Ldone4:
lea -8(%r10),%rsp
- ret
+ RET
.Lxorpart4:
# xor remaining bytes from partial register into output
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index 6e7d4c4d3208..c392a6edbfff 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -236,5 +236,5 @@ fold_64:
pxor %xmm2, %xmm1
pextrd $0x01, %xmm1, %eax
- ret
+ RET
SYM_FUNC_END(crc32_pclmul_le_16)
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index ac1f303eed0f..80c0d22fc42c 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -306,7 +306,7 @@ do_return:
popq %rsi
popq %rdi
popq %rbx
- ret
+ RET
SYM_FUNC_END(crc_pcl)
.section .rodata, "a", @progbits
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
index b2533d63030e..721474abfb71 100644
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -257,7 +257,7 @@ SYM_FUNC_START(crc_t10dif_pcl)
# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
pextrw $0, %xmm0, %eax
- ret
+ RET
.align 16
.Lless_than_256_bytes:
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index 38caf61cd5b7..d55fa9e9b9e6 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -64,10 +64,9 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
/* Return the carry bit in a register */
" adcx %%r11, %1;"
- : "+&r" (f2), "=&r" (carry_r)
- : "r" (out), "r" (f1)
- : "%r8", "%r9", "%r10", "%r11", "memory", "cc"
- );
+ : "+&r"(f2), "=&r"(carry_r)
+ : "r"(out), "r"(f1)
+ : "%r8", "%r9", "%r10", "%r11", "memory", "cc");
return carry_r;
}
@@ -108,10 +107,9 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
" cmovc %0, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%1);"
- : "+&r" (f2)
- : "r" (out), "r" (f1)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
- );
+ : "+&r"(f2)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
/* Computes the field subtraction of two field elements */
@@ -151,10 +149,9 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
" movq %%r9, 8(%0);"
" movq %%r10, 16(%0);"
" movq %%r11, 24(%0);"
- :
- : "r" (out), "r" (f1), "r" (f2)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
- );
+ :
+ : "r"(out), "r"(f1), "r"(f2)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
/* Computes a field multiplication: out <- f1 * f2
@@ -162,239 +159,400 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
+
/* Compute the raw multiplication: tmp <- src1 * src2 */
/* Compute src1[0] * src2 */
- " movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;"
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
/* Compute src1[1] * src2 */
- " movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[2] * src2 */
- " movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[3] * src2 */
- " movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
+
/* Line up pointers */
- " mov %0, %1;"
" mov %2, %0;"
+ " mov %3, %2;"
/* Wrap the result back into the field */
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
- " xor %k3, %k3;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
- " adcx %3, %%rax;"
- " adox %3, %%rax;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
- " adcx %3, %%r9;"
- " movq %%r9, 8(%0);"
- " adcx %3, %%r10;"
- " movq %%r10, 16(%0);"
- " adcx %3, %%r11;"
- " movq %%r11, 24(%0);"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
- : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
- :
- : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
- );
+ " movq %%r8, 0(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
}
/* Computes two field multiplications:
- * out[0] <- f1[0] * f2[0]
- * out[1] <- f1[1] * f2[1]
- * Uses the 16-element buffer tmp for intermediate results. */
+ * out[0] <- f1[0] * f2[0]
+ * out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results: */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
+
/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
/* Compute src1[0] * src2 */
- " movq 0(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;"
+ " movq 0(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 0(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
/* Compute src1[1] * src2 */
- " movq 8(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 8(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 8(%2), %%r8;"
+ " movq %%r8, 8(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 16(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[2] * src2 */
- " movq 16(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 16(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 16(%2), %%r8;"
+ " movq %%r8, 16(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 24(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[3] * src2 */
- " movq 24(%1), %%rdx;"
- " mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
- " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
- " mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
- " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
+ " movq 24(%0), %%rdx;"
+ " mulxq 0(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 24(%2), %%r8;"
+ " movq %%r8, 24(%2);"
+ " mulxq 8(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 32(%2);"
+ " mulxq 16(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 40(%2);"
+ " mov $0, %%r8;"
+ " mulxq 24(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%2);"
/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
/* Compute src1[0] * src2 */
- " movq 32(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;"
+ " movq 32(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " movq %%r8, 64(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 72(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+
/* Compute src1[1] * src2 */
- " movq 40(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 40(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 72(%2), %%r8;"
+ " movq %%r8, 72(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 80(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[2] * src2 */
- " movq 48(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ " movq 48(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 80(%2), %%r8;"
+ " movq %%r8, 80(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 88(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+
/* Compute src1[3] * src2 */
- " movq 56(%1), %%rdx;"
- " mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
- " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
- " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
- " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
- " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
+ " movq 56(%0), %%rdx;"
+ " mulxq 32(%1), %%r8, %%r9;"
+ " xor %%r10d, %%r10d;"
+ " adcxq 88(%2), %%r8;"
+ " movq %%r8, 88(%2);"
+ " mulxq 40(%1), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%rbx, %%r10;"
+ " movq %%r10, 96(%2);"
+ " mulxq 48(%1), %%rbx, %%r13;"
+ " adox %%r11, %%rbx;"
+ " adcx %%r14, %%rbx;"
+ " movq %%rbx, 104(%2);"
+ " mov $0, %%r8;"
+ " mulxq 56(%1), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 112(%2);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 120(%2);"
+
/* Line up pointers */
- " mov %0, %1;"
" mov %2, %0;"
+ " mov %3, %2;"
/* Wrap the results back into the field */
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
- " xor %k3, %k3;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " mulxq 32(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
- " adcx %3, %%rax;"
- " adox %3, %%rax;"
+ " adoxq 24(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
- " adcx %3, %%r9;"
- " movq %%r9, 8(%0);"
- " adcx %3, %%r10;"
- " movq %%r10, 16(%0);"
- " adcx %3, %%r11;"
- " movq %%r11, 24(%0);"
+ " adcx %1, %%r9;"
+ " movq %%r9, 8(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 16(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 24(%2);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
+ " movq %%r8, 0(%2);"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 96(%1), %%r8, %%r13;"
- " xor %k3, %k3;"
- " adoxq 64(%1), %%r8;"
- " mulxq 104(%1), %%r9, %%rbx;"
+ " mulxq 96(%0), %%r8, %%r13;"
+ " xor %k1, %k1;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 72(%1), %%r9;"
- " mulxq 112(%1), %%r10, %%r13;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 80(%1), %%r10;"
- " mulxq 120(%1), %%r11, %%rax;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 88(%1), %%r11;"
- " adcx %3, %%rax;"
- " adox %3, %%rax;"
+ " adoxq 88(%0), %%r11;"
+ " adcx %1, %%rax;"
+ " adox %1, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
- " adcx %3, %%r9;"
- " movq %%r9, 40(%0);"
- " adcx %3, %%r10;"
- " movq %%r10, 48(%0);"
- " adcx %3, %%r11;"
- " movq %%r11, 56(%0);"
+ " adcx %1, %%r9;"
+ " movq %%r9, 40(%2);"
+ " adcx %1, %%r10;"
+ " movq %%r10, 48(%2);"
+ " adcx %1, %%r11;"
+ " movq %%r11, 56(%2);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 32(%0);"
- : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
- :
- : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
- );
+ " movq %%r8, 32(%2);"
+ : "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "%r14", "memory", "cc");
}
-/* Computes the field multiplication of four-element f1 with value in f2 */
+/* Computes the field multiplication of four-element f1 with value in f2
+ * Requires f2 to be smaller than 2^17 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
register u64 f2_r asm("rdx") = f2;
asm volatile(
/* Compute the raw multiplication of f1*f2 */
- " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
- " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
+ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+ " mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
" add %%rcx, %%r9;"
" mov $0, %%rcx;"
- " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
+ " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
" adcx %%rbx, %%r10;"
- " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
+ " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
" adcx %%r13, %%r11;"
" adcx %%rcx, %%rax;"
@@ -418,17 +576,17 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%1);"
- : "+&r" (f2_r)
- : "r" (out), "r" (f1)
- : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
- );
+ : "+&r"(f2_r)
+ : "r"(out), "r"(f1)
+ : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
+ "memory", "cc");
}
/* Computes p1 <- bit ? p2 : p1 in constant time */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
asm volatile(
- /* Invert the polarity of bit to match cmov expectations */
+ /* Transfer bit into CF flag */
" add $18446744073709551615, %0;"
/* cswap p1[0], p2[0] */
@@ -502,10 +660,9 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" cmovc %%r10, %%r9;"
" movq %%r8, 56(%1);"
" movq %%r9, 56(%2);"
- : "+&r" (bit)
- : "r" (p1), "r" (p2)
- : "%r8", "%r9", "%r10", "memory", "cc"
- );
+ : "+&r"(bit)
+ : "r"(p1), "r"(p2)
+ : "%r8", "%r9", "%r10", "memory", "cc");
}
/* Computes the square of a field element: out <- f * f
@@ -516,15 +673,22 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
/* Compute the raw multiplication: tmp <- f * f */
/* Step 1: Compute all partial products */
- " movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
- " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
- " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
- " movq 24(%1), %%rdx;" /* f[3] */
- " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
- " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
- " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
@@ -542,39 +706,50 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
- " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
- " movq %%rax, 0(%0);"
- " add %%rcx, %%r8;" " movq %%r8, 8(%0);"
- " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
- " adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
- " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
- " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
- " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
- " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
- " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
- " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
- " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
/* Line up pointers */
- " mov %0, %1;"
- " mov %2, %0;"
+ " mov %1, %0;"
+ " mov %2, %1;"
/* Wrap the result back into the field */
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
+ " mulxq 32(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
+ " adoxq 24(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
@@ -582,40 +757,47 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
- " movq %%r9, 8(%0);"
+ " movq %%r9, 8(%1);"
" adcx %%rcx, %%r10;"
- " movq %%r10, 16(%0);"
+ " movq %%r10, 16(%1);"
" adcx %%rcx, %%r11;"
- " movq %%r11, 24(%0);"
+ " movq %%r11, 24(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
- : "+&r" (tmp), "+&r" (f), "+&r" (out)
- :
- : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
- );
+ " movq %%r8, 0(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
}
/* Computes two field squarings:
- * out[0] <- f[0] * f[0]
- * out[1] <- f[1] * f[1]
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
* Uses the 16-element buffer tmp for intermediate results */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
/* Step 1: Compute all partial products */
- " movq 0(%1), %%rdx;" /* f[0] */
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
- " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
- " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
- " movq 24(%1), %%rdx;" /* f[3] */
- " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
- " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
- " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+ " movq 0(%0), %%rdx;" /* f[0] */
+ " mulxq 8(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 16(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%0), %%rdx;" /* f[3] */
+ " mulxq 8(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 8(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
@@ -633,29 +815,47 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
- " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
- " movq %%rax, 0(%0);"
- " add %%rcx, %%r8;" " movq %%r8, 8(%0);"
- " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
- " adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
- " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
- " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
- " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
- " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
- " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
- " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
- " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
+ " movq 0(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%1);"
+ " movq 8(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%1);"
+ " movq 16(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 40(%1);"
+ " movq 24(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%1);"
/* Step 1: Compute all partial products */
- " movq 32(%1), %%rdx;" /* f[0] */
- " mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
- " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
- " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
- " movq 56(%1), %%rdx;" /* f[3] */
- " mulxq 40(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
- " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
- " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
- " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+ " movq 32(%0), %%rdx;" /* f[0] */
+ " mulxq 40(%0), %%r8, %%r14;"
+ " xor %%r15d, %%r15d;" /* f[1]*f[0] */
+ " mulxq 48(%0), %%r9, %%r10;"
+ " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 56(%0), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 56(%0), %%rdx;" /* f[3] */
+ " mulxq 40(%0), %%r11, %%rbx;"
+ " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%0), %%rax, %%r13;"
+ " adcx %%rax, %%rbx;" /* f[2]*f[3] */
+ " movq 40(%0), %%rdx;"
+ " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 48(%0), %%rax, %%rcx;"
+ " mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
@@ -673,37 +873,48 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
- " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
- " movq %%rax, 64(%0);"
- " add %%rcx, %%r8;" " movq %%r8, 72(%0);"
- " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
- " adcx %%rax, %%r9;" " movq %%r9, 80(%0);"
- " adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
- " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
- " adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
- " adcx %%rcx, %%rbx;" " movq %%rbx, 104(%0);"
- " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
- " adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
- " adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
+ " movq 32(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 64(%1);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 72(%1);"
+ " movq 40(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 80(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 88(%1);"
+ " movq 48(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 96(%1);"
+ " adcx %%rcx, %%rbx;"
+ " movq %%rbx, 104(%1);"
+ " movq 56(%0), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 112(%1);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 120(%1);"
/* Line up pointers */
- " mov %0, %1;"
- " mov %2, %0;"
+ " mov %1, %0;"
+ " mov %2, %1;"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 32(%1), %%r8, %%r13;"
+ " mulxq 32(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
- " adoxq 0(%1), %%r8;"
- " mulxq 40(%1), %%r9, %%rbx;"
+ " adoxq 0(%0), %%r8;"
+ " mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 8(%1), %%r9;"
- " mulxq 48(%1), %%r10, %%r13;"
+ " adoxq 8(%0), %%r9;"
+ " mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 16(%1), %%r10;"
- " mulxq 56(%1), %%r11, %%rax;"
+ " adoxq 16(%0), %%r10;"
+ " mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 24(%1), %%r11;"
+ " adoxq 24(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
@@ -711,32 +922,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
- " movq %%r9, 8(%0);"
+ " movq %%r9, 8(%1);"
" adcx %%rcx, %%r10;"
- " movq %%r10, 16(%0);"
+ " movq %%r10, 16(%1);"
" adcx %%rcx, %%r11;"
- " movq %%r11, 24(%0);"
+ " movq %%r11, 24(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 0(%0);"
+ " movq %%r8, 0(%1);"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
- " mulxq 96(%1), %%r8, %%r13;"
+ " mulxq 96(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
- " adoxq 64(%1), %%r8;"
- " mulxq 104(%1), %%r9, %%rbx;"
+ " adoxq 64(%0), %%r8;"
+ " mulxq 104(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
- " adoxq 72(%1), %%r9;"
- " mulxq 112(%1), %%r10, %%r13;"
+ " adoxq 72(%0), %%r9;"
+ " mulxq 112(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
- " adoxq 80(%1), %%r10;"
- " mulxq 120(%1), %%r11, %%rax;"
+ " adoxq 80(%0), %%r10;"
+ " mulxq 120(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
- " adoxq 88(%1), %%r11;"
+ " adoxq 88(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
@@ -744,21 +955,21 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
- " movq %%r9, 40(%0);"
+ " movq %%r9, 40(%1);"
" adcx %%rcx, %%r10;"
- " movq %%r10, 48(%0);"
+ " movq %%r10, 48(%1);"
" adcx %%rcx, %%r11;"
- " movq %%r11, 56(%0);"
+ " movq %%r11, 56(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
- " movq %%r8, 32(%0);"
- : "+&r" (tmp), "+&r" (f), "+&r" (out)
- :
- : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
- );
+ " movq %%r8, 32(%1);"
+ : "+&r"(f), "+&r"(tmp)
+ : "r"(out)
+ : "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+ "%r13", "%r14", "%r15", "memory", "cc");
}
static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index fac0fdc3f25d..f4c760f4cade 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -243,7 +243,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk)
popq %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(des3_ede_x86_64_crypt_blk)
/***********************************************************************
@@ -528,7 +528,7 @@ SYM_FUNC_START(des3_ede_x86_64_crypt_blk_3way)
popq %r12;
popq %rbx;
- ret;
+ RET;
SYM_FUNC_END(des3_ede_x86_64_crypt_blk_3way)
.section .rodata, "a", @progbits
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index e7cb68a3db3b..787c234d2469 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -164,7 +164,7 @@ static int cbc_encrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_encrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
@@ -243,7 +243,7 @@ static int cbc_decrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false);
- while ((nbytes = walk.nbytes)) {
+ while (walk.nbytes) {
nbytes = __cbc_decrypt(ctx, &walk);
err = skcipher_walk_done(&walk, nbytes);
}
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 99ac25e18e09..2bf871899920 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -85,7 +85,7 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
psrlq $1, T2
pxor T2, T1
pxor T1, DATA
- ret
+ RET
SYM_FUNC_END(__clmul_gf128mul_ble)
/* void clmul_ghash_mul(char *dst, const u128 *shash) */
@@ -99,7 +99,7 @@ SYM_FUNC_START(clmul_ghash_mul)
pshufb BSWAP, DATA
movups DATA, (%rdi)
FRAME_END
- ret
+ RET
SYM_FUNC_END(clmul_ghash_mul)
/*
@@ -128,5 +128,5 @@ SYM_FUNC_START(clmul_ghash_update)
movups DATA, (%rdi)
.Lupdate_just_ret:
FRAME_END
- ret
+ RET
SYM_FUNC_END(clmul_ghash_update)
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
index b22c7b936272..6a0b15e7196a 100644
--- a/arch/x86/crypto/nh-avx2-x86_64.S
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -153,5 +153,5 @@ SYM_FUNC_START(nh_avx2)
vpaddq T1, T0, T0
vpaddq T4, T0, T0
vmovdqu T0, (HASH)
- ret
+ RET
SYM_FUNC_END(nh_avx2)
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
index d7ae22dd6683..34c567bbcb4f 100644
--- a/arch/x86/crypto/nh-sse2-x86_64.S
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -119,5 +119,5 @@ SYM_FUNC_START(nh_sse2)
paddq PASS2_SUMS, T1
movdqu T0, 0x00(HASH)
movdqu T1, 0x10(HASH)
- ret
+ RET
SYM_FUNC_END(nh_sse2)
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index b7ee24df7fba..82f2313f512b 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -601,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk8_avx)
.align 8
@@ -655,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_dec_blk8_avx)
SYM_FUNC_START(serpent_ecb_enc_8way_avx)
@@ -673,7 +673,7 @@ SYM_FUNC_START(serpent_ecb_enc_8way_avx)
store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_enc_8way_avx)
SYM_FUNC_START(serpent_ecb_dec_8way_avx)
@@ -691,7 +691,7 @@ SYM_FUNC_START(serpent_ecb_dec_8way_avx)
store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_dec_8way_avx)
SYM_FUNC_START(serpent_cbc_dec_8way_avx)
@@ -709,5 +709,5 @@ SYM_FUNC_START(serpent_cbc_dec_8way_avx)
store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_cbc_dec_8way_avx)
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
index 9161b6e441f3..8ea34c9b9316 100644
--- a/arch/x86/crypto/serpent-avx2-asm_64.S
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -601,7 +601,7 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk16)
.align 8
@@ -655,7 +655,7 @@ SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_dec_blk16)
SYM_FUNC_START(serpent_ecb_enc_16way)
@@ -677,7 +677,7 @@ SYM_FUNC_START(serpent_ecb_enc_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_enc_16way)
SYM_FUNC_START(serpent_ecb_dec_16way)
@@ -699,7 +699,7 @@ SYM_FUNC_START(serpent_ecb_dec_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_ecb_dec_16way)
SYM_FUNC_START(serpent_cbc_dec_16way)
@@ -722,5 +722,5 @@ SYM_FUNC_START(serpent_cbc_dec_16way)
vzeroupper;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(serpent_cbc_dec_16way)
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
index 6379b99cb722..8ccb03ad7cef 100644
--- a/arch/x86/crypto/serpent-sse2-i586-asm_32.S
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -553,12 +553,12 @@ SYM_FUNC_START(__serpent_enc_blk_4way)
write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
- ret;
+ RET;
.L__enc_xor4:
xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk_4way)
SYM_FUNC_START(serpent_dec_blk_4way)
@@ -612,5 +612,5 @@ SYM_FUNC_START(serpent_dec_blk_4way)
movl arg_dst(%esp), %eax;
write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
- ret;
+ RET;
SYM_FUNC_END(serpent_dec_blk_4way)
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
index efb6dc17dc90..e0998a011d1d 100644
--- a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -675,13 +675,13 @@ SYM_FUNC_START(__serpent_enc_blk_8way)
write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
.L__enc_xor8:
xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(__serpent_enc_blk_8way)
SYM_FUNC_START(serpent_dec_blk_8way)
@@ -735,5 +735,5 @@ SYM_FUNC_START(serpent_dec_blk_8way)
write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
- ret;
+ RET;
SYM_FUNC_END(serpent_dec_blk_8way)
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 5eed620f4676..a96b2fd26dab 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -674,7 +674,7 @@ _loop3:
pop %r12
pop %rbx
- ret
+ RET
SYM_FUNC_END(\name)
.endm
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
index 5d8415f482bd..2f94ec0e763b 100644
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -290,7 +290,7 @@ SYM_FUNC_START(sha1_ni_transform)
mov %rbp, %rsp
pop %rbp
- ret
+ RET
SYM_FUNC_END(sha1_ni_transform)
.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index d25668d2a1e9..263f916362e0 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -99,7 +99,7 @@
pop %rbp
pop %r12
pop %rbx
- ret
+ RET
SYM_FUNC_END(\name)
.endm
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 4739cd31b9db..3baa1ec39097 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -458,7 +458,7 @@ done_hash:
popq %r13
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_avx)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 4087f7432a7e..9bcdbc47b8b4 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -710,7 +710,7 @@ done_hash:
popq %r13
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_rorx)
.section .rodata.cst512.K256, "aM", @progbits, 512
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index ddfa863b4ee3..c4a5db612c32 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -472,7 +472,7 @@ done_hash:
popq %r12
popq %rbx
- ret
+ RET
SYM_FUNC_END(sha256_transform_ssse3)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 7abade04a3a3..94d50dd27cb5 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -326,7 +326,7 @@ SYM_FUNC_START(sha256_ni_transform)
.Ldone_hash:
- ret
+ RET
SYM_FUNC_END(sha256_ni_transform)
.section .rodata.cst256.K256, "aM", @progbits, 256
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 3d8f0fd4eea8..1fefe6dd3a9e 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -361,7 +361,7 @@ updateblock:
pop %rbx
nowork:
- ret
+ RET
SYM_FUNC_END(sha512_transform_avx)
########################################################################
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 072cb0f0deae..5cdaab7d6901 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -679,7 +679,7 @@ done_hash:
pop %r12
pop %rbx
- ret
+ RET
SYM_FUNC_END(sha512_transform_rorx)
########################################################################
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index bd51c9070bed..b84c22e06c5f 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -363,7 +363,7 @@ updateblock:
pop %rbx
nowork:
- ret
+ RET
SYM_FUNC_END(sha512_transform_ssse3)
########################################################################
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
index 1cc72b4804fa..4767ab61ff48 100644
--- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -246,7 +246,7 @@ SYM_FUNC_START(sm4_aesni_avx_crypt4)
.Lblk4_store_output_done:
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)
.align 8
@@ -356,7 +356,7 @@ SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
vpshufb RTMP2, RB3, RB3;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(__sm4_crypt_blk8)
/*
@@ -412,7 +412,7 @@ SYM_FUNC_START(sm4_aesni_avx_crypt8)
.Lblk8_store_output_done:
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)
/*
@@ -487,7 +487,7 @@ SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)
/*
@@ -537,7 +537,7 @@ SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
/*
@@ -590,5 +590,5 @@ SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx_cfb_dec_blk8)
diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
index 9c5d3f3ad45a..4732fe8bb65b 100644
--- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
@@ -268,7 +268,7 @@ SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
vpshufb RTMP2, RB3, RB3;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(__sm4_crypt_blk16)
#define inc_le128(x, minus_one, tmp) \
@@ -387,7 +387,7 @@ SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)
/*
@@ -441,7 +441,7 @@ SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
/*
@@ -497,5 +497,5 @@ SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
vzeroall;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(sm4_aesni_avx2_cfb_dec_blk16)
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 37e63b3c664e..31f9b2ec3857 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -267,7 +267,7 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
- ret;
+ RET;
SYM_FUNC_END(__twofish_enc_blk8)
.align 8
@@ -307,7 +307,7 @@ SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
- ret;
+ RET;
SYM_FUNC_END(__twofish_dec_blk8)
SYM_FUNC_START(twofish_ecb_enc_8way)
@@ -327,7 +327,7 @@ SYM_FUNC_START(twofish_ecb_enc_8way)
store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_ecb_enc_8way)
SYM_FUNC_START(twofish_ecb_dec_8way)
@@ -347,7 +347,7 @@ SYM_FUNC_START(twofish_ecb_dec_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_ecb_dec_8way)
SYM_FUNC_START(twofish_cbc_dec_8way)
@@ -372,5 +372,5 @@ SYM_FUNC_START(twofish_cbc_dec_8way)
popq %r12;
FRAME_END
- ret;
+ RET;
SYM_FUNC_END(twofish_cbc_dec_8way)
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index a6f09e4f2e46..3abcad661884 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -260,7 +260,7 @@ SYM_FUNC_START(twofish_enc_blk)
pop %ebx
pop %ebp
mov $1, %eax
- ret
+ RET
SYM_FUNC_END(twofish_enc_blk)
SYM_FUNC_START(twofish_dec_blk)
@@ -317,5 +317,5 @@ SYM_FUNC_START(twofish_dec_blk)
pop %ebx
pop %ebp
mov $1, %eax
- ret
+ RET
SYM_FUNC_END(twofish_dec_blk)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index bca4cea757ce..d2288bf38a8a 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -258,7 +258,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
.L__enc_xor3:
outunpack_enc3(xor);
@@ -266,7 +266,7 @@ SYM_FUNC_START(__twofish_enc_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
SYM_FUNC_END(__twofish_enc_blk_3way)
SYM_FUNC_START(twofish_dec_blk_3way)
@@ -301,5 +301,5 @@ SYM_FUNC_START(twofish_dec_blk_3way)
popq %rbx;
popq %r12;
popq %r13;
- ret;
+ RET;
SYM_FUNC_END(twofish_dec_blk_3way)
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index d2e56232494a..775af290cd19 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -252,7 +252,7 @@ SYM_FUNC_START(twofish_enc_blk)
popq R1
movl $1,%eax
- ret
+ RET
SYM_FUNC_END(twofish_enc_blk)
SYM_FUNC_START(twofish_dec_blk)
@@ -304,5 +304,5 @@ SYM_FUNC_START(twofish_dec_blk)
popq R1
movl $1,%eax
- ret
+ RET
SYM_FUNC_END(twofish_dec_blk)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index ccb9d32768f3..887420844066 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -268,19 +268,16 @@
1: popl %ds
2: popl %es
3: popl %fs
- addl $(4 + \pop), %esp /* pop the unused "gs" slot */
+4: addl $(4 + \pop), %esp /* pop the unused "gs" slot */
IRET_FRAME
-.pushsection .fixup, "ax"
-4: movl $0, (%esp)
- jmp 1b
-5: movl $0, (%esp)
- jmp 2b
-6: movl $0, (%esp)
- jmp 3b
-.popsection
- _ASM_EXTABLE(1b, 4b)
- _ASM_EXTABLE(2b, 5b)
- _ASM_EXTABLE(3b, 6b)
+
+ /*
+ * There is no _ASM_EXTABLE_TYPE_REG() for ASM, however since this is
+ * ASM the registers are known and we can trivially hard-code them.
+ */
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_POP_ZERO|EX_REG_DS)
+ _ASM_EXTABLE_TYPE(2b, 3b, EX_TYPE_POP_ZERO|EX_REG_ES)
+ _ASM_EXTABLE_TYPE(3b, 4b, EX_TYPE_POP_ZERO|EX_REG_FS)
.endm
.macro RESTORE_ALL_NMI cr3_reg:req pop=0
@@ -740,7 +737,7 @@ SYM_FUNC_START(schedule_tail_wrapper)
popl %eax
FRAME_END
- ret
+ RET
SYM_FUNC_END(schedule_tail_wrapper)
.popsection
@@ -925,10 +922,8 @@ SYM_FUNC_START(entry_SYSENTER_32)
sti
sysexit
-.pushsection .fixup, "ax"
-2: movl $0, PT_FS(%esp)
- jmp 1b
-.popsection
+2: movl $0, PT_FS(%esp)
+ jmp 1b
_ASM_EXTABLE(1b, 2b)
.Lsysenter_fix_flags:
@@ -996,8 +991,7 @@ restore_all_switch_stack:
*/
iret
-.section .fixup, "ax"
-SYM_CODE_START(asm_iret_error)
+.Lasm_iret_error:
pushl $0 # no error code
pushl $iret_error
@@ -1014,9 +1008,8 @@ SYM_CODE_START(asm_iret_error)
#endif
jmp handle_exception
-SYM_CODE_END(asm_iret_error)
-.previous
- _ASM_EXTABLE(.Lirq_return, asm_iret_error)
+
+ _ASM_EXTABLE(.Lirq_return, .Lasm_iret_error)
SYM_FUNC_END(entry_INT80_32)
.macro FIXUP_ESPFIX_STACK
@@ -1248,14 +1241,14 @@ SYM_CODE_START(asm_exc_nmi)
SYM_CODE_END(asm_exc_nmi)
.pushsection .text, "ax"
-SYM_CODE_START(rewind_stack_do_exit)
+SYM_CODE_START(rewind_stack_and_make_dead)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
- call do_exit
+ call make_task_dead
1: jmp 1b
-SYM_CODE_END(rewind_stack_do_exit)
+SYM_CODE_END(rewind_stack_and_make_dead)
.popsection
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e38a4cf795d9..466df3e50276 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -574,6 +574,10 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
ud2
1:
#endif
+#ifdef CONFIG_XEN_PV
+ ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+#endif
+
POP_REGS pop_rdi=0
/*
@@ -734,14 +738,10 @@ SYM_FUNC_START(asm_load_gs_index)
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
swapgs
FRAME_END
- ret
-SYM_FUNC_END(asm_load_gs_index)
-EXPORT_SYMBOL(asm_load_gs_index)
+ RET
- _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
- .section .fixup, "ax"
/* running with kernelgs */
-SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
+.Lbad_gs:
swapgs /* switch back to user gs */
.macro ZAP_GS
/* This can't be a string because the preprocessor needs to see it. */
@@ -752,8 +752,11 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
xorl %eax, %eax
movl %eax, %gs
jmp 2b
-SYM_CODE_END(.Lbad_gs)
- .previous
+
+ _ASM_EXTABLE(.Lgs_change, .Lbad_gs)
+
+SYM_FUNC_END(asm_load_gs_index)
+EXPORT_SYMBOL(asm_load_gs_index)
#ifdef CONFIG_XEN_PV
/*
@@ -885,11 +888,12 @@ SYM_CODE_START_LOCAL(paranoid_entry)
* is needed here.
*/
SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
- ret
+ RET
.Lparanoid_entry_checkgs:
/* EBX = 1 -> kernel GSBASE active, no restore required */
movl $1, %ebx
+
/*
* The kernel-enforced convention is a negative GSBASE indicates
* a kernel value. No SWAPGS needed on entry and exit.
@@ -897,22 +901,15 @@ SYM_CODE_START_LOCAL(paranoid_entry)
movl $MSR_GS_BASE, %ecx
rdmsr
testl %edx, %edx
- jns .Lparanoid_entry_swapgs
- ret
+ js .Lparanoid_kernel_gsbase
-.Lparanoid_entry_swapgs:
+ /* EBX = 0 -> SWAPGS required on exit */
+ xorl %ebx, %ebx
swapgs
+.Lparanoid_kernel_gsbase:
- /*
- * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
- * unconditional CR3 write, even in the PTI case. So do an lfence
- * to prevent GS speculation, regardless of whether PTI is enabled.
- */
FENCE_SWAPGS_KERNEL_ENTRY
-
- /* EBX = 0 -> SWAPGS required on exit */
- xorl %ebx, %ebx
- ret
+ RET
SYM_CODE_END(paranoid_entry)
/*
@@ -991,12 +988,7 @@ SYM_CODE_START_LOCAL(error_entry)
movq %rax, %rsp /* switch stack */
ENCODE_FRAME_POINTER
pushq %r12
- ret
-
-.Lerror_entry_done_lfence:
- FENCE_SWAPGS_KERNEL_ENTRY
-.Lerror_entry_done:
- ret
+ RET
/*
* There are two places in the kernel that can potentially fault with
@@ -1020,8 +1012,14 @@ SYM_CODE_START_LOCAL(error_entry)
* .Lgs_change's error handler with kernel gsbase.
*/
SWAPGS
- FENCE_SWAPGS_USER_ENTRY
- jmp .Lerror_entry_done
+
+ /*
+ * Issue an LFENCE to prevent GS speculation, regardless of whether it is a
+ * kernel or user gsbase.
+ */
+.Lerror_entry_done_lfence:
+ FENCE_SWAPGS_KERNEL_ENTRY
+ RET
.Lbstep_iret:
/* Fix truncated RIP */
@@ -1429,7 +1427,7 @@ SYM_CODE_END(ignore_sysret)
#endif
.pushsection .text, "ax"
-SYM_CODE_START(rewind_stack_do_exit)
+SYM_CODE_START(rewind_stack_and_make_dead)
UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
@@ -1438,6 +1436,6 @@ SYM_CODE_START(rewind_stack_do_exit)
leaq -PTREGS_SIZE(%rax), %rsp
UNWIND_HINT_REGS
- call do_exit
-SYM_CODE_END(rewind_stack_do_exit)
+ call make_task_dead
+SYM_CODE_END(rewind_stack_and_make_dead)
.popsection
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 7e25543693de..320480a8db4f 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -454,3 +454,4 @@
447 i386 memfd_secret sys_memfd_secret
448 i386 process_mrelease sys_process_mrelease
449 i386 futex_waitv sys_futex_waitv
+450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index fe8f8dd157b4..c84d12608cd2 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -371,6 +371,7 @@
447 common memfd_secret sys_memfd_secret
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
+450 common set_mempolicy_home_node sys_set_mempolicy_home_node
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
index f1f96d4d8cd6..7591bab060f7 100644
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -24,7 +24,7 @@ SYM_CODE_START_NOALIGN(\name)
popl %edx
popl %ecx
popl %eax
- ret
+ RET
_ASM_NOKPROBE(\name)
SYM_CODE_END(\name)
.endm
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index 496b11ec469d..505b488fcc65 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -50,7 +50,7 @@ SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore)
popq %rsi
popq %rdi
popq %rbp
- ret
+ RET
_ASM_NOKPROBE(__thunk_restore)
SYM_CODE_END(__thunk_restore)
#endif
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index a2dddcc189f6..693f8b9031fb 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -172,7 +172,7 @@ $(obj)/vdso32.so.dbg: $(obj)/vdso32/vdso32.lds $(vobjs32) FORCE
# The DSO images are built using a special linker script.
#
quiet_cmd_vdso = VDSO $@
- cmd_vdso = $(LD) -nostdlib -o $@ \
+ cmd_vdso = $(LD) -o $@ \
$(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
-T $(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index dc8da7695859..bafa73f09e92 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -77,7 +77,6 @@ SECTIONS
.text : {
*(.text*)
- *(.fixup)
} :text =0x90909090,
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index 6ddd7a937b3e..d33c6513fd2c 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -78,7 +78,7 @@ SYM_INNER_LABEL(int80_landing_pad, SYM_L_GLOBAL)
popl %ecx
CFI_RESTORE ecx
CFI_ADJUST_CFA_OFFSET -4
- ret
+ RET
CFI_ENDPROC
.size __kernel_vsyscall,.-__kernel_vsyscall
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
index 99dafac992e2..d77d278ee9dd 100644
--- a/arch/x86/entry/vdso/vsgx.S
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -81,7 +81,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
pop %rbx
leave
.cfi_def_cfa %rsp, 8
- ret
+ RET
/* The out-of-line code runs with the pre-leave stack frame. */
.cfi_def_cfa %rbp, 16
diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
index 2e203f3a25a7..15e35159ebb6 100644
--- a/arch/x86/entry/vsyscall/vsyscall_emu_64.S
+++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
@@ -19,17 +19,17 @@ __vsyscall_page:
mov $__NR_gettimeofday, %rax
syscall
- ret
+ RET
.balign 1024, 0xcc
mov $__NR_time, %rax
syscall
- ret
+ RET
.balign 1024, 0xcc
mov $__NR_getcpu, %rax
syscall
- ret
+ RET
.balign 4096, 0xcc
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index 913745f1419b..b15f7b950d2e 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -161,7 +161,7 @@ static int get_next_avail_iommu_bnk_cntr(struct perf_event *event)
raw_spin_lock_irqsave(&piommu->lock, flags);
- for (bank = 0, shift = 0; bank < max_banks; bank++) {
+ for (bank = 0; bank < max_banks; bank++) {
for (cntr = 0; cntr < max_cntrs; cntr++) {
shift = bank + (bank*3) + cntr;
if (piommu->cntr_assign_mask & BIT_ULL(shift)) {
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 38b2c779146f..e686c5e0537b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2476,7 +2476,7 @@ static int x86_pmu_event_init(struct perf_event *event)
if (READ_ONCE(x86_pmu.attr_rdpmc) &&
!(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
- event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+ event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
return err;
}
@@ -2510,7 +2510,7 @@ void perf_clear_dirty_counters(void)
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
return;
/*
@@ -2531,7 +2531,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
- if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
return;
if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
@@ -2542,7 +2542,7 @@ static int x86_pmu_event_idx(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
return 0;
if (is_metric_idx(hwc->idx))
@@ -2725,7 +2725,7 @@ void arch_perf_update_userpage(struct perf_event *event,
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
userpg->cap_user_rdpmc =
- !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+ !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
userpg->pmc_width = x86_pmu.cntval_bits;
if (!using_native_sched_clock() || !sched_clock_stable())
@@ -2771,7 +2771,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
struct unwind_state state;
unsigned long addr;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (perf_guest_state()) {
/* TODO: We don't support guest os callchain now */
return;
}
@@ -2874,7 +2874,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
struct stack_frame frame;
const struct stack_frame __user *fp;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (perf_guest_state()) {
/* TODO: We don't support guest os callchain now */
return;
}
@@ -2951,18 +2951,19 @@ static unsigned long code_segment_base(struct pt_regs *regs)
unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
- return perf_guest_cbs->get_guest_ip();
+ if (perf_guest_state())
+ return perf_guest_get_ip();
return regs->ip + code_segment_base(regs);
}
unsigned long perf_misc_flags(struct pt_regs *regs)
{
+ unsigned int guest_state = perf_guest_state();
int misc = 0;
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- if (perf_guest_cbs->is_user_mode())
+ if (guest_state) {
+ if (guest_state & PERF_GUEST_USER)
misc |= PERF_RECORD_MISC_GUEST_USER;
else
misc |= PERF_RECORD_MISC_GUEST_KERNEL;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index ec6444f2c9dc..c91434056c29 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2901,10 +2901,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
*/
if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) {
handled++;
- if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() &&
- perf_guest_cbs->handle_intel_pt_intr))
- perf_guest_cbs->handle_intel_pt_intr();
- else
+ if (!perf_guest_handle_intel_pt_intr())
intel_pt_interrupt();
}
@@ -6239,6 +6236,19 @@ __init int intel_pmu_init(void)
pmu->num_counters = x86_pmu.num_counters;
pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
}
+
+ /*
+ * Quirk: For some Alder Lake machine, when all E-cores are disabled in
+ * a BIOS, the leaf 0xA will enumerate all counters of P-cores. However,
+ * the X86_FEATURE_HYBRID_CPU is still set. The above codes will
+ * mistakenly add extra counters for P-cores. Correct the number of
+ * counters here.
+ */
+ if ((pmu->num_counters > 8) || (pmu->num_counters_fixed > 4)) {
+ pmu->num_counters = x86_pmu.num_counters;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+ }
+
pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
pmu->unconstrained = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
@@ -6343,6 +6353,8 @@ __init int intel_pmu_init(void)
}
if (x86_pmu.lbr_nr) {
+ intel_pmu_lbr_init();
+
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
/* only support branch_stack snapshot for perfmon >= v2 */
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8043213b75a5..669c2be14784 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -8,14 +8,6 @@
#include "../perf_event.h"
-static const enum {
- LBR_EIP_FLAGS = 1,
- LBR_TSX = 2,
-} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
- [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS,
- [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
-};
-
/*
* Intel LBR_SELECT bits
* Intel Vol3a, April 2011, Section 16.7 Table 16-10
@@ -243,7 +235,7 @@ void intel_pmu_lbr_reset_64(void)
for (i = 0; i < x86_pmu.lbr_nr; i++) {
wrmsrl(x86_pmu.lbr_from + i, 0);
wrmsrl(x86_pmu.lbr_to + i, 0);
- if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ if (x86_pmu.lbr_has_info)
wrmsrl(x86_pmu.lbr_info + i, 0);
}
}
@@ -305,11 +297,10 @@ enum {
*/
static inline bool lbr_from_signext_quirk_needed(void)
{
- int lbr_format = x86_pmu.intel_cap.lbr_format;
bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) ||
boot_cpu_has(X86_FEATURE_RTM);
- return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX);
+ return !tsx_support && x86_pmu.lbr_has_tsx;
}
static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key);
@@ -427,12 +418,12 @@ rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
void intel_pmu_lbr_restore(void *ctx)
{
- bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx = ctx;
- int i;
- unsigned lbr_idx, mask;
+ bool need_info = x86_pmu.lbr_has_info;
u64 tos = task_ctx->tos;
+ unsigned lbr_idx, mask;
+ int i;
mask = x86_pmu.lbr_nr - 1;
for (i = 0; i < task_ctx->valid_lbrs; i++) {
@@ -444,7 +435,7 @@ void intel_pmu_lbr_restore(void *ctx)
lbr_idx = (tos - i) & mask;
wrlbr_from(lbr_idx, 0);
wrlbr_to(lbr_idx, 0);
- if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ if (need_info)
wrlbr_info(lbr_idx, 0);
}
@@ -519,9 +510,9 @@ static void __intel_pmu_lbr_restore(void *ctx)
void intel_pmu_lbr_save(void *ctx)
{
- bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx = ctx;
+ bool need_info = x86_pmu.lbr_has_info;
unsigned lbr_idx, mask;
u64 tos;
int i;
@@ -816,7 +807,6 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
bool need_info = false, call_stack = false;
unsigned long mask = x86_pmu.lbr_nr - 1;
- int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
int i;
int out = 0;
@@ -831,9 +821,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
for (i = 0; i < num; i++) {
unsigned long lbr_idx = (tos - i) & mask;
u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
- int skip = 0;
u16 cycles = 0;
- int lbr_flags = lbr_desc[lbr_format];
from = rdlbr_from(lbr_idx, NULL);
to = rdlbr_to(lbr_idx, NULL);
@@ -845,37 +833,39 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
if (call_stack && !from)
break;
- if (lbr_format == LBR_FORMAT_INFO && need_info) {
- u64 info;
-
- info = rdlbr_info(lbr_idx, NULL);
- mis = !!(info & LBR_INFO_MISPRED);
- pred = !mis;
- in_tx = !!(info & LBR_INFO_IN_TX);
- abort = !!(info & LBR_INFO_ABORT);
- cycles = (info & LBR_INFO_CYCLES);
- }
-
- if (lbr_format == LBR_FORMAT_TIME) {
- mis = !!(from & LBR_FROM_FLAG_MISPRED);
- pred = !mis;
- skip = 1;
- cycles = ((to >> 48) & LBR_INFO_CYCLES);
-
- to = (u64)((((s64)to) << 16) >> 16);
- }
-
- if (lbr_flags & LBR_EIP_FLAGS) {
- mis = !!(from & LBR_FROM_FLAG_MISPRED);
- pred = !mis;
- skip = 1;
- }
- if (lbr_flags & LBR_TSX) {
- in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
- abort = !!(from & LBR_FROM_FLAG_ABORT);
- skip = 3;
+ if (x86_pmu.lbr_has_info) {
+ if (need_info) {
+ u64 info;
+
+ info = rdlbr_info(lbr_idx, NULL);
+ mis = !!(info & LBR_INFO_MISPRED);
+ pred = !mis;
+ cycles = (info & LBR_INFO_CYCLES);
+ if (x86_pmu.lbr_has_tsx) {
+ in_tx = !!(info & LBR_INFO_IN_TX);
+ abort = !!(info & LBR_INFO_ABORT);
+ }
+ }
+ } else {
+ int skip = 0;
+
+ if (x86_pmu.lbr_from_flags) {
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
+ skip = 1;
+ }
+ if (x86_pmu.lbr_has_tsx) {
+ in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+ abort = !!(from & LBR_FROM_FLAG_ABORT);
+ skip = 3;
+ }
+ from = (u64)((((s64)from) << skip) >> skip);
+
+ if (x86_pmu.lbr_to_cycles) {
+ cycles = ((to >> 48) & LBR_INFO_CYCLES);
+ to = (u64)((((s64)to) << 16) >> 16);
+ }
}
- from = (u64)((((s64)from) << skip) >> skip);
/*
* Some CPUs report duplicated abort records,
@@ -903,37 +893,40 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
cpuc->lbr_stack.hw_idx = tos;
}
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_mispred);
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_cycles);
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_type);
+
static __always_inline int get_lbr_br_type(u64 info)
{
- if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type)
- return 0;
+ int type = 0;
- return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET;
+ if (static_branch_likely(&x86_lbr_type))
+ type = (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET;
+
+ return type;
}
static __always_inline bool get_lbr_mispred(u64 info)
{
- if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred)
- return 0;
+ bool mispred = 0;
- return !!(info & LBR_INFO_MISPRED);
-}
+ if (static_branch_likely(&x86_lbr_mispred))
+ mispred = !!(info & LBR_INFO_MISPRED);
-static __always_inline bool get_lbr_predicted(u64 info)
-{
- if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred)
- return 0;
-
- return !(info & LBR_INFO_MISPRED);
+ return mispred;
}
static __always_inline u16 get_lbr_cycles(u64 info)
{
+ u16 cycles = info & LBR_INFO_CYCLES;
+
if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
- !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID))
- return 0;
+ (!static_branch_likely(&x86_lbr_cycles) ||
+ !(info & LBR_INFO_CYC_CNT_VALID)))
+ cycles = 0;
- return info & LBR_INFO_CYCLES;
+ return cycles;
}
static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
@@ -961,7 +954,7 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
e->from = from;
e->to = to;
e->mispred = get_lbr_mispred(info);
- e->predicted = get_lbr_predicted(info);
+ e->predicted = !e->mispred;
e->in_tx = !!(info & LBR_INFO_IN_TX);
e->abort = !!(info & LBR_INFO_ABORT);
e->cycles = get_lbr_cycles(info);
@@ -1120,7 +1113,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
(br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
- (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+ x86_pmu.lbr_has_info)
reg->config |= LBR_NO_INFO;
return 0;
@@ -1706,6 +1699,38 @@ void intel_pmu_lbr_init_knl(void)
x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
}
+void intel_pmu_lbr_init(void)
+{
+ switch (x86_pmu.intel_cap.lbr_format) {
+ case LBR_FORMAT_EIP_FLAGS2:
+ x86_pmu.lbr_has_tsx = 1;
+ fallthrough;
+ case LBR_FORMAT_EIP_FLAGS:
+ x86_pmu.lbr_from_flags = 1;
+ break;
+
+ case LBR_FORMAT_INFO:
+ x86_pmu.lbr_has_tsx = 1;
+ fallthrough;
+ case LBR_FORMAT_INFO2:
+ x86_pmu.lbr_has_info = 1;
+ break;
+
+ case LBR_FORMAT_TIME:
+ x86_pmu.lbr_from_flags = 1;
+ x86_pmu.lbr_to_cycles = 1;
+ break;
+ }
+
+ if (x86_pmu.lbr_has_info) {
+ /*
+ * Only used in combination with baseline pebs.
+ */
+ static_branch_enable(&x86_lbr_mispred);
+ static_branch_enable(&x86_lbr_cycles);
+ }
+}
+
/*
* LBR state size is variable based on the max number of registers.
* This calculates the expected state size, which should match
@@ -1726,6 +1751,9 @@ static bool is_arch_lbr_xsave_available(void)
* Check the LBR state with the corresponding software structure.
* Disable LBR XSAVES support if the size doesn't match.
*/
+ if (xfeature_size(XFEATURE_LBR) == 0)
+ return false;
+
if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size()))
return false;
@@ -1765,6 +1793,12 @@ void __init intel_pmu_arch_lbr_init(void)
x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
x86_pmu.lbr_nr = lbr_nr;
+ if (x86_pmu.lbr_mispred)
+ static_branch_enable(&x86_lbr_mispred);
+ if (x86_pmu.lbr_timed_lbr)
+ static_branch_enable(&x86_lbr_cycles);
+ if (x86_pmu.lbr_br_type)
+ static_branch_enable(&x86_lbr_type);
arch_lbr_xsave = is_arch_lbr_xsave_available();
if (arch_lbr_xsave) {
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index f1ba6ab2e97e..e497da9bf427 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1762,7 +1762,7 @@ static const struct intel_uncore_init_fun rkl_uncore_init __initconst = {
static const struct intel_uncore_init_fun adl_uncore_init __initconst = {
.cpu_init = adl_uncore_cpu_init,
- .mmio_init = tgl_uncore_mmio_init,
+ .mmio_init = adl_uncore_mmio_init,
};
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index b9687980aab6..2adeaf4de4df 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -584,10 +584,11 @@ void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
void skl_uncore_cpu_init(void);
void icl_uncore_cpu_init(void);
-void adl_uncore_cpu_init(void);
void tgl_uncore_cpu_init(void);
+void adl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
+void adl_uncore_mmio_init(void);
int snb_pci2phy_map_init(int devid);
/* uncore_snbep.c */
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index 3049c646fa20..6ddadb482f68 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -494,8 +494,8 @@ void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box)
writel(0, box->io_addr);
}
-static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
- struct perf_event *event)
+void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 6d735611c281..cfaf558bdb6b 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -139,6 +139,8 @@ void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box);
void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box);
void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box,
struct perf_event *event);
+void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box);
void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 0f63706cdadf..f698a55bde81 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */
#include "uncore.h"
+#include "uncore_discovery.h"
/* Uncore IMC PCI IDs */
#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100
@@ -64,6 +65,20 @@
#define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53
#define PCI_DEVICE_ID_INTEL_ADL_1_IMC 0x4660
#define PCI_DEVICE_ID_INTEL_ADL_2_IMC 0x4641
+#define PCI_DEVICE_ID_INTEL_ADL_3_IMC 0x4601
+#define PCI_DEVICE_ID_INTEL_ADL_4_IMC 0x4602
+#define PCI_DEVICE_ID_INTEL_ADL_5_IMC 0x4609
+#define PCI_DEVICE_ID_INTEL_ADL_6_IMC 0x460a
+#define PCI_DEVICE_ID_INTEL_ADL_7_IMC 0x4621
+#define PCI_DEVICE_ID_INTEL_ADL_8_IMC 0x4623
+#define PCI_DEVICE_ID_INTEL_ADL_9_IMC 0x4629
+#define PCI_DEVICE_ID_INTEL_ADL_10_IMC 0x4637
+#define PCI_DEVICE_ID_INTEL_ADL_11_IMC 0x463b
+#define PCI_DEVICE_ID_INTEL_ADL_12_IMC 0x4648
+#define PCI_DEVICE_ID_INTEL_ADL_13_IMC 0x4649
+#define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650
+#define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668
+#define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -155,6 +170,7 @@
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
@@ -1334,6 +1350,62 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_2_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_4_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_5_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_6_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_7_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_8_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_9_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_10_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_11_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_12_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_13_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_14_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_15_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_16_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
{ /* end: all zeroes */ }
};
@@ -1390,7 +1462,8 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void)
#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000
#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000
-static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+static void __uncore_imc_init_box(struct intel_uncore_box *box,
+ unsigned int base_offset)
{
struct pci_dev *pdev = tgl_uncore_get_mc_dev();
struct intel_uncore_pmu *pmu = box->pmu;
@@ -1417,11 +1490,17 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
addr |= ((resource_size_t)mch_bar << 32);
#endif
+ addr += base_offset;
box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr)
pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
}
+static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, 0);
+}
+
static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = {
.init_box = tgl_uncore_imc_freerunning_init_box,
.exit_box = uncore_mmio_exit_box,
@@ -1469,3 +1548,136 @@ void tgl_uncore_mmio_init(void)
}
/* end of Tiger Lake MMIO uncore support */
+
+/* Alder Lake MMIO uncore support */
+#define ADL_UNCORE_IMC_BASE 0xd900
+#define ADL_UNCORE_IMC_MAP_SIZE 0x200
+#define ADL_UNCORE_IMC_CTR 0xe8
+#define ADL_UNCORE_IMC_CTRL 0xd0
+#define ADL_UNCORE_IMC_GLOBAL_CTL 0xc0
+#define ADL_UNCORE_IMC_BOX_CTL 0xc4
+#define ADL_UNCORE_IMC_FREERUNNING_BASE 0xd800
+#define ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE 0x100
+
+#define ADL_UNCORE_IMC_CTL_FRZ (1 << 0)
+#define ADL_UNCORE_IMC_CTL_RST_CTRL (1 << 1)
+#define ADL_UNCORE_IMC_CTL_RST_CTRS (1 << 2)
+#define ADL_UNCORE_IMC_CTL_INT (ADL_UNCORE_IMC_CTL_RST_CTRL | \
+ ADL_UNCORE_IMC_CTL_RST_CTRS)
+
+static void adl_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, ADL_UNCORE_IMC_BASE);
+
+ /* The global control in MC1 can control both MCs. */
+ if (box->io_addr && (box->pmu->pmu_idx == 1))
+ writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + ADL_UNCORE_IMC_GLOBAL_CTL);
+}
+
+static void adl_uncore_mmio_disable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(ADL_UNCORE_IMC_CTL_FRZ, box->io_addr + uncore_mmio_box_ctl(box));
+}
+
+static void adl_uncore_mmio_enable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(0, box->io_addr + uncore_mmio_box_ctl(box));
+}
+
+static struct intel_uncore_ops adl_uncore_mmio_ops = {
+ .init_box = adl_uncore_imc_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = adl_uncore_mmio_disable_box,
+ .enable_box = adl_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = intel_generic_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+#define ADL_UNC_CTL_CHMASK_MASK 0x00000f00
+#define ADL_UNC_IMC_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ ADL_UNC_CTL_CHMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET)
+
+static struct attribute *adl_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_chmask.attr,
+ &format_attr_edge.attr,
+ NULL,
+};
+
+static const struct attribute_group adl_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = adl_uncore_imc_formats_attr,
+};
+
+static struct intel_uncore_type adl_uncore_imc = {
+ .name = "imc",
+ .num_counters = 5,
+ .num_boxes = 2,
+ .perf_ctr_bits = 64,
+ .perf_ctr = ADL_UNCORE_IMC_CTR,
+ .event_ctl = ADL_UNCORE_IMC_CTRL,
+ .event_mask = ADL_UNC_IMC_EVENT_MASK,
+ .box_ctl = ADL_UNCORE_IMC_BOX_CTL,
+ .mmio_offset = 0,
+ .mmio_map_size = ADL_UNCORE_IMC_MAP_SIZE,
+ .ops = &adl_uncore_mmio_ops,
+ .format_group = &adl_uncore_imc_format_group,
+};
+
+enum perf_adl_uncore_imc_freerunning_types {
+ ADL_MMIO_UNCORE_IMC_DATA_TOTAL,
+ ADL_MMIO_UNCORE_IMC_DATA_READ,
+ ADL_MMIO_UNCORE_IMC_DATA_WRITE,
+ ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX
+};
+
+static struct freerunning_counters adl_uncore_imc_freerunning[] = {
+ [ADL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0x40, 0x0, 0x0, 1, 64 },
+ [ADL_MMIO_UNCORE_IMC_DATA_READ] = { 0x58, 0x0, 0x0, 1, 64 },
+ [ADL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0xA0, 0x0, 0x0, 1, 64 },
+};
+
+static void adl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, ADL_UNCORE_IMC_FREERUNNING_BASE);
+}
+
+static struct intel_uncore_ops adl_uncore_imc_freerunning_ops = {
+ .init_box = adl_uncore_imc_freerunning_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .read_counter = uncore_mmio_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct intel_uncore_type adl_uncore_imc_free_running = {
+ .name = "imc_free_running",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .num_freerunning_types = ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE,
+ .freerunning = adl_uncore_imc_freerunning,
+ .ops = &adl_uncore_imc_freerunning_ops,
+ .event_descs = tgl_uncore_imc_events,
+ .format_group = &tgl_uncore_imc_format_group,
+};
+
+static struct intel_uncore_type *adl_mmio_uncores[] = {
+ &adl_uncore_imc,
+ &adl_uncore_imc_free_running,
+ NULL
+};
+
+void adl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = adl_mmio_uncores;
+}
+
+/* end of Alder Lake MMIO uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 3660f698fb2a..ed869443efb2 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -5482,7 +5482,7 @@ static struct intel_uncore_type icx_uncore_imc = {
.fixed_ctr_bits = 48,
.fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
.fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
- .event_descs = hswep_uncore_imc_events,
+ .event_descs = snr_uncore_imc_events,
.perf_ctr = SNR_IMC_MMIO_PMON_CTR0,
.event_ctl = SNR_IMC_MMIO_PMON_CTL0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 5480db242083..150261d929b9 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -74,7 +74,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */
#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */
#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */
+
#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */
#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */
#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */
@@ -215,7 +215,8 @@ enum {
LBR_FORMAT_EIP_FLAGS2 = 0x04,
LBR_FORMAT_INFO = 0x05,
LBR_FORMAT_TIME = 0x06,
- LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME,
+ LBR_FORMAT_INFO2 = 0x07,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO2,
};
enum {
@@ -840,6 +841,11 @@ struct x86_pmu {
bool lbr_double_abort; /* duplicated lbr aborts */
bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */
+ unsigned int lbr_has_info:1;
+ unsigned int lbr_has_tsx:1;
+ unsigned int lbr_from_flags:1;
+ unsigned int lbr_to_cycles:1;
+
/*
* Intel Architectural LBR CPUID Enumeration
*/
@@ -1392,6 +1398,8 @@ void intel_pmu_lbr_init_skl(void);
void intel_pmu_lbr_init_knl(void);
+void intel_pmu_lbr_init(void);
+
void intel_pmu_arch_lbr_init(void);
void intel_pmu_pebs_data_source_nhm(void);
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 85feafacc445..77e3a47af5ad 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -536,11 +536,14 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
* - perf_msr_probe(PERF_RAPL_MAX)
* - want to use same event codes across both architectures
*/
-static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = {
- [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr },
+static struct perf_msr amd_rapl_msrs[] = {
+ [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 },
+ [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 },
+ [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 },
+ [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, 0, false, 0 },
};
-
static int rapl_cpu_offline(unsigned int cpu)
{
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 96eb7db31c8e..8b392b6b7b93 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -28,6 +28,7 @@
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
+#include <linux/swiotlb.h>
int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
@@ -36,7 +37,7 @@ EXPORT_SYMBOL_GPL(hv_current_partition_id);
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
-union hv_ghcb __percpu **hv_ghcb_pg;
+union hv_ghcb * __percpu *hv_ghcb_pg;
/* Storage to save the hypercall page temporarily for hibernation */
static void *hv_hypercall_pg_saved;
@@ -498,6 +499,17 @@ void __init hyperv_init(void)
/* Query the VMs extended capability once, so that it can be cached. */
hv_query_ext_cap(0);
+
+#ifdef CONFIG_SWIOTLB
+ /*
+ * Swiotlb bounce buffer needs to be mapped in extra address
+ * space. Map function doesn't work in the early place and so
+ * call swiotlb_update_mem_attributes() here.
+ */
+ if (hv_is_isolation_supported())
+ swiotlb_update_mem_attributes();
+#endif
+
return;
clean_guest_os_id:
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 514fc64e23d5..7e0f6bedc248 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -253,64 +253,43 @@ static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry
return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
}
-static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
{
- u64 status;
struct hv_interrupt_entry old_entry;
- struct irq_desc *desc;
- struct irq_data *data;
struct msi_msg msg;
+ u64 status;
- desc = irq_to_desc(irq);
- if (!desc) {
- pr_debug("%s: no irq desc\n", __func__);
- return;
- }
-
- data = &desc->irq_data;
- if (!data) {
- pr_debug("%s: no irq data\n", __func__);
- return;
- }
-
- if (!data->chip_data) {
+ if (!irqd->chip_data) {
pr_debug("%s: no chip data\n!", __func__);
return;
}
- old_entry = *(struct hv_interrupt_entry *)data->chip_data;
+ old_entry = *(struct hv_interrupt_entry *)irqd->chip_data;
entry_to_msi_msg(&old_entry, &msg);
- kfree(data->chip_data);
- data->chip_data = NULL;
+ kfree(irqd->chip_data);
+ irqd->chip_data = NULL;
status = hv_unmap_msi_interrupt(dev, &old_entry);
- if (status != HV_STATUS_SUCCESS) {
+ if (status != HV_STATUS_SUCCESS)
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
- return;
- }
}
-static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+static void hv_msi_free_irq(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq)
{
- int i;
- struct msi_desc *entry;
- struct pci_dev *pdev;
+ struct irq_data *irqd = irq_get_irq_data(virq);
+ struct msi_desc *desc;
- if (WARN_ON_ONCE(!dev_is_pci(dev)))
+ if (!irqd)
return;
- pdev = to_pci_dev(dev);
+ desc = irq_data_get_msi_desc(irqd);
+ if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev)))
+ return;
- for_each_pci_msi_entry(entry, pdev) {
- if (entry->irq) {
- for (i = 0; i < entry->nvec_used; i++) {
- hv_teardown_msi_irq_common(pdev, entry, entry->irq + i);
- irq_domain_free_irqs(entry->irq + i, 1);
- }
- }
- }
+ hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
}
/*
@@ -329,7 +308,7 @@ static struct irq_chip hv_pci_msi_controller = {
};
static struct msi_domain_ops pci_msi_domain_ops = {
- .domain_free_irqs = hv_msi_domain_free_irqs,
+ .msi_free = hv_msi_free_irq,
.msi_prepare = pci_msi_prepare,
};
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 69c7a57f3307..2b994117581e 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -287,3 +287,31 @@ int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visibl
kfree(pfn_array);
return ret;
}
+
+/*
+ * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM.
+ */
+void *hv_map_memory(void *addr, unsigned long size)
+{
+ unsigned long *pfns = kcalloc(size / PAGE_SIZE,
+ sizeof(unsigned long), GFP_KERNEL);
+ void *vaddr;
+ int i;
+
+ if (!pfns)
+ return NULL;
+
+ for (i = 0; i < size / PAGE_SIZE; i++)
+ pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) +
+ (ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT);
+
+ vaddr = vmap_pfn(pfns, size / PAGE_SIZE, PAGE_KERNEL_IO);
+ kfree(pfns);
+
+ return vaddr;
+}
+
+void hv_unmap_memory(void *addr)
+{
+ vunmap(addr);
+}
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index bd13736d0c05..0ad2378fe6ad 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -68,15 +68,6 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
local_irq_save(flags);
- /*
- * Only check the mask _after_ interrupt has been disabled to avoid the
- * mask changing under our feet.
- */
- if (cpumask_empty(cpus)) {
- local_irq_restore(flags);
- return;
- }
-
flush_pcpu = (struct hv_tlb_flush **)
this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -115,7 +106,9 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
* must. We will also check all VP numbers when walking the
* supplied CPU set to remain correct in all cases.
*/
- if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64)
+ cpu = cpumask_last(cpus);
+
+ if (cpu < nr_cpumask_bits && hv_cpu_number_to_vp_number(cpu) >= 64)
goto do_ex_hypercall;
for_each_cpu(cpu, cpus) {
@@ -131,6 +124,12 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
__set_bit(vcpu, (unsigned long *)
&flush->processor_mask);
}
+
+ /* nothing to flush if 'processor_mask' ends up being empty */
+ if (!flush->processor_mask) {
+ local_irq_restore(flags);
+ return;
+ }
}
/*
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 455066a06f60..00d1a400b7a1 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -24,7 +24,6 @@ extern int amd_set_subcaches(int, unsigned long);
extern int amd_smn_read(u16 node, u32 address, u32 *value);
extern int amd_smn_write(u16 node, u32 address, u32 value);
-extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo);
struct amd_l3_cache {
unsigned indices;
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 3a168483bc8e..c878fed3056f 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -152,6 +152,33 @@
#else /* ! __ASSEMBLY__ */
+# define DEFINE_EXTABLE_TYPE_REG \
+ ".macro extable_type_reg type:req reg:req\n" \
+ ".set found, 0\n" \
+ ".set regnr, 0\n" \
+ ".irp rs,rax,rcx,rdx,rbx,rsp,rbp,rsi,rdi,r8,r9,r10,r11,r12,r13,r14,r15\n" \
+ ".ifc \\reg, %%\\rs\n" \
+ ".set found, found+1\n" \
+ ".long \\type + (regnr << 8)\n" \
+ ".endif\n" \
+ ".set regnr, regnr+1\n" \
+ ".endr\n" \
+ ".set regnr, 0\n" \
+ ".irp rs,eax,ecx,edx,ebx,esp,ebp,esi,edi,r8d,r9d,r10d,r11d,r12d,r13d,r14d,r15d\n" \
+ ".ifc \\reg, %%\\rs\n" \
+ ".set found, found+1\n" \
+ ".long \\type + (regnr << 8)\n" \
+ ".endif\n" \
+ ".set regnr, regnr+1\n" \
+ ".endr\n" \
+ ".if (found != 1)\n" \
+ ".error \"extable_type_reg: bad register argument\"\n" \
+ ".endif\n" \
+ ".endm\n"
+
+# define UNDEFINE_EXTABLE_TYPE_REG \
+ ".purgem extable_type_reg\n"
+
# define _ASM_EXTABLE_TYPE(from, to, type) \
" .pushsection \"__ex_table\",\"a\"\n" \
" .balign 4\n" \
@@ -160,6 +187,16 @@
" .long " __stringify(type) " \n" \
" .popsection\n"
+# define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \
+ " .pushsection \"__ex_table\",\"a\"\n" \
+ " .balign 4\n" \
+ " .long (" #from ") - .\n" \
+ " .long (" #to ") - .\n" \
+ DEFINE_EXTABLE_TYPE_REG \
+ "extable_type_reg reg=" __stringify(reg) ", type=" __stringify(type) " \n"\
+ UNDEFINE_EXTABLE_TYPE_REG \
+ " .popsection\n"
+
/* For C file, we already have NOKPROBE_SYMBOL macro */
/*
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 3ba772a69cc8..35389b2af88e 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -19,9 +19,9 @@
#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \
X86_FEATURE_XMM2) ::: "memory", "cc")
#else
-#define mb() asm volatile("mfence":::"memory")
-#define rmb() asm volatile("lfence":::"memory")
-#define wmb() asm volatile("sfence" ::: "memory")
+#define __mb() asm volatile("mfence":::"memory")
+#define __rmb() asm volatile("lfence":::"memory")
+#define __wmb() asm volatile("sfence" ::: "memory")
#endif
/**
@@ -51,8 +51,8 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
/* Prevent speculative execution past this barrier. */
#define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC)
-#define dma_rmb() barrier()
-#define dma_wmb() barrier()
+#define __dma_rmb() barrier()
+#define __dma_wmb() barrier()
#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc")
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 0367efdc5b7a..a288ecd230ab 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -380,8 +380,6 @@ static __always_inline int fls64(__u64 x)
#include <asm-generic/bitops/fls64.h>
#endif
-#include <asm-generic/bitops/find.h>
-
#include <asm-generic/bitops/sched.h>
#include <asm/arch_hweight.h>
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index d5b5f2ab87a0..6db4e2932b3d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -299,7 +299,9 @@
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
+#define X86_FEATURE_AMX_BF16 (18*32+22) /* AMX bf16 Support */
#define X86_FEATURE_AMX_TILE (18*32+24) /* AMX tile Support */
+#define X86_FEATURE_AMX_INT8 (18*32+25) /* AMX int8 Support */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
@@ -315,6 +317,7 @@
#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
+#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 4d0b126835b8..03cb12775043 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -46,13 +46,14 @@ extern unsigned long efi_mixed_mode_stack_pa;
#define __efi_nargs(...) __efi_nargs_(__VA_ARGS__)
#define __efi_nargs_(...) __efi_nargs__(0, ##__VA_ARGS__, \
+ __efi_arg_sentinel(9), __efi_arg_sentinel(8), \
__efi_arg_sentinel(7), __efi_arg_sentinel(6), \
__efi_arg_sentinel(5), __efi_arg_sentinel(4), \
__efi_arg_sentinel(3), __efi_arg_sentinel(2), \
__efi_arg_sentinel(1), __efi_arg_sentinel(0))
-#define __efi_nargs__(_0, _1, _2, _3, _4, _5, _6, _7, n, ...) \
+#define __efi_nargs__(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, ...) \
__take_second_arg(n, \
- ({ BUILD_BUG_ON_MSG(1, "__efi_nargs limit exceeded"); 8; }))
+ ({ BUILD_BUG_ON_MSG(1, "__efi_nargs limit exceeded"); 10; }))
#define __efi_arg_sentinel(n) , n
/*
@@ -176,8 +177,9 @@ extern u64 efi_setup;
extern efi_status_t __efi64_thunk(u32, ...);
#define efi64_thunk(...) ({ \
- __efi_nargs_check(efi64_thunk, 6, __VA_ARGS__); \
- __efi64_thunk(__VA_ARGS__); \
+ u64 __pad[3]; /* must have space for 3 args on the stack */ \
+ __efi_nargs_check(efi64_thunk, 9, __VA_ARGS__); \
+ __efi64_thunk(__VA_ARGS__, __pad); \
})
static inline bool efi_is_mixed(void)
@@ -197,8 +199,6 @@ static inline bool efi_runtime_supported(void)
extern void parse_efi_setup(u64 phys_addr, u32 data_len);
-extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
-
extern void efi_thunk_runtime_setup(void);
efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
unsigned long descriptor_size,
@@ -308,6 +308,10 @@ static inline u32 efi64_convert_status(efi_status_t status)
#define __efi64_argmap_query_mode(gop, mode, size, info) \
((gop), (mode), efi64_zero_upper(size), efi64_zero_upper(info))
+/* TCG2 protocol */
+#define __efi64_argmap_hash_log_extend_event(prot, fl, addr, size, ev) \
+ ((prot), (fl), 0ULL, (u64)(addr), 0ULL, (u64)(size), 0ULL, ev)
+
/*
* The macros below handle the plumbing for the argument mapping. To add a
* mapping for a specific EFI method, simply define a macro
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
index 93f400eb728f..155c991ba95e 100644
--- a/arch/x86/include/asm/extable.h
+++ b/arch/x86/include/asm/extable.h
@@ -21,7 +21,7 @@
*/
struct exception_table_entry {
- int insn, fixup, type;
+ int insn, fixup, data;
};
struct pt_regs;
@@ -31,8 +31,8 @@ struct pt_regs;
do { \
(a)->fixup = (b)->fixup + (delta); \
(b)->fixup = (tmp).fixup - (delta); \
- (a)->type = (b)->type; \
- (b)->type = (tmp).type; \
+ (a)->data = (b)->data; \
+ (b)->data = (tmp).data; \
} while (0)
extern int fixup_exception(struct pt_regs *regs, int trapnr,
diff --git a/arch/x86/include/asm/extable_fixup_types.h b/arch/x86/include/asm/extable_fixup_types.h
index 409524d5d2eb..503622627400 100644
--- a/arch/x86/include/asm/extable_fixup_types.h
+++ b/arch/x86/include/asm/extable_fixup_types.h
@@ -2,6 +2,36 @@
#ifndef _ASM_X86_EXTABLE_FIXUP_TYPES_H
#define _ASM_X86_EXTABLE_FIXUP_TYPES_H
+/*
+ * Our IMM is signed, as such it must live at the top end of the word. Also,
+ * since C99 hex constants are of ambigious type, force cast the mask to 'int'
+ * so that FIELD_GET() will DTRT and sign extend the value when it extracts it.
+ */
+#define EX_DATA_TYPE_MASK ((int)0x000000FF)
+#define EX_DATA_REG_MASK ((int)0x00000F00)
+#define EX_DATA_FLAG_MASK ((int)0x0000F000)
+#define EX_DATA_IMM_MASK ((int)0xFFFF0000)
+
+#define EX_DATA_REG_SHIFT 8
+#define EX_DATA_FLAG_SHIFT 12
+#define EX_DATA_IMM_SHIFT 16
+
+#define EX_DATA_REG(reg) ((reg) << EX_DATA_REG_SHIFT)
+#define EX_DATA_FLAG(flag) ((flag) << EX_DATA_FLAG_SHIFT)
+#define EX_DATA_IMM(imm) ((imm) << EX_DATA_IMM_SHIFT)
+
+/* segment regs */
+#define EX_REG_DS EX_DATA_REG(8)
+#define EX_REG_ES EX_DATA_REG(9)
+#define EX_REG_FS EX_DATA_REG(10)
+#define EX_REG_GS EX_DATA_REG(11)
+
+/* flags */
+#define EX_FLAG_CLEAR_AX EX_DATA_FLAG(1)
+#define EX_FLAG_CLEAR_DX EX_DATA_FLAG(2)
+#define EX_FLAG_CLEAR_AX_DX EX_DATA_FLAG(3)
+
+/* types */
#define EX_TYPE_NONE 0
#define EX_TYPE_DEFAULT 1
#define EX_TYPE_FAULT 2
@@ -9,14 +39,29 @@
#define EX_TYPE_COPY 4
#define EX_TYPE_CLEAR_FS 5
#define EX_TYPE_FPU_RESTORE 6
-#define EX_TYPE_WRMSR 7
-#define EX_TYPE_RDMSR 8
-#define EX_TYPE_BPF 9
+#define EX_TYPE_BPF 7
+#define EX_TYPE_WRMSR 8
+#define EX_TYPE_RDMSR 9
+#define EX_TYPE_WRMSR_SAFE 10 /* reg := -EIO */
+#define EX_TYPE_RDMSR_SAFE 11 /* reg := -EIO */
+#define EX_TYPE_WRMSR_IN_MCE 12
+#define EX_TYPE_RDMSR_IN_MCE 13
+#define EX_TYPE_DEFAULT_MCE_SAFE 14
+#define EX_TYPE_FAULT_MCE_SAFE 15
+
+#define EX_TYPE_POP_REG 16 /* sp += sizeof(long) */
+#define EX_TYPE_POP_ZERO (EX_TYPE_POP_REG | EX_DATA_IMM(0))
+
+#define EX_TYPE_IMM_REG 17 /* reg := (long)imm */
+#define EX_TYPE_EFAULT_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(-EFAULT))
+#define EX_TYPE_ZERO_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(0))
+#define EX_TYPE_ONE_REG (EX_TYPE_IMM_REG | EX_DATA_IMM(1))
-#define EX_TYPE_WRMSR_IN_MCE 10
-#define EX_TYPE_RDMSR_IN_MCE 11
+#define EX_TYPE_FAULT_SGX 18
-#define EX_TYPE_DEFAULT_MCE_SAFE 12
-#define EX_TYPE_FAULT_MCE_SAFE 13
+#define EX_TYPE_UCOPY_LEN 19 /* cx := reg + imm*cx */
+#define EX_TYPE_UCOPY_LEN1 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(1))
+#define EX_TYPE_UCOPY_LEN4 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(4))
+#define EX_TYPE_UCOPY_LEN8 (EX_TYPE_UCOPY_LEN | EX_DATA_IMM(8))
#endif
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 6053674f9132..c83b3020350a 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -102,12 +102,6 @@ extern void switch_fpu_return(void);
*/
extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
-/*
- * Tasks that are not using SVA have mm->pasid set to zero to note that they
- * will not have the valid bit set in MSR_IA32_PASID while they are running.
- */
-#define PASID_DISABLED 0
-
/* Trap handling */
extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
extern void fpu_sync_fpstate(struct fpu *fpu);
@@ -138,10 +132,21 @@ static inline void fpstate_free(struct fpu *fpu) { }
/* fpstate-related functions which are exported to KVM */
extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature);
+extern u64 xstate_get_guest_group_perm(void);
+
/* KVM specific functions */
extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu);
extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu);
extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest);
+extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures);
+
+#ifdef CONFIG_X86_64
+extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd);
+extern void fpu_sync_guest_vmexit_xfd_state(void);
+#else
+static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { }
+static inline void fpu_sync_guest_vmexit_xfd_state(void) { }
+#endif
extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru);
extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru);
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 22b0273a8bf1..e1c9df9102a5 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -41,7 +41,4 @@ extern void fpu__clear_user_states(struct fpu *fpu);
extern bool fpu__restore_sig(void __user *buf, int ia32_frame);
extern void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask);
-
-extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
-
#endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 3c06c82ab355..eb7cd1139d97 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -387,6 +387,8 @@ struct fpstate {
/* @regs is dynamically sized! Don't add anything after @regs! */
} __aligned(64);
+#define FPU_GUEST_PERM_LOCKED BIT_ULL(63)
+
struct fpu_state_perm {
/*
* @__state_perm:
@@ -477,6 +479,13 @@ struct fpu {
struct fpu_state_perm perm;
/*
+ * @guest_perm:
+ *
+ * Permission related information for guest pseudo FPUs
+ */
+ struct fpu_state_perm guest_perm;
+
+ /*
* @__fpstate:
*
* Initial in-memory storage for FPU registers which are saved in
@@ -496,6 +505,29 @@ struct fpu {
*/
struct fpu_guest {
/*
+ * @xfeatures: xfeature bitmap of features which are
+ * currently enabled for the guest vCPU.
+ */
+ u64 xfeatures;
+
+ /*
+ * @perm: xfeature bitmap of features which are
+ * permitted to be enabled for the guest
+ * vCPU.
+ */
+ u64 perm;
+
+ /*
+ * @xfd_err: Save the guest value.
+ */
+ u64 xfd_err;
+
+ /*
+ * @uabi_size: Size required for save/restore
+ */
+ unsigned int uabi_size;
+
+ /*
* @fpstate: Pointer to the allocated guest fpstate
*/
struct fpstate *fpstate;
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index f9c00110a69a..99d345b686fa 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -17,13 +17,9 @@ do { \
int oldval = 0, ret; \
asm volatile("1:\t" insn "\n" \
"2:\n" \
- "\t.section .fixup,\"ax\"\n" \
- "3:\tmov\t%3, %1\n" \
- "\tjmp\t2b\n" \
- "\t.previous\n" \
- _ASM_EXTABLE_UA(1b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %1) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
- : "i" (-EFAULT), "0" (oparg), "1" (0)); \
+ : "0" (oparg), "1" (0)); \
if (ret) \
goto label; \
*oval = oldval; \
@@ -39,15 +35,11 @@ do { \
"3:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \
"\tjnz\t2b\n" \
"4:\n" \
- "\t.section .fixup,\"ax\"\n" \
- "5:\tmov\t%5, %1\n" \
- "\tjmp\t4b\n" \
- "\t.previous\n" \
- _ASM_EXTABLE_UA(1b, 5b) \
- _ASM_EXTABLE_UA(3b, 5b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 4b, EX_TYPE_EFAULT_REG, %1) \
+ _ASM_EXTABLE_TYPE_REG(3b, 4b, EX_TYPE_EFAULT_REG, %1) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
- : "r" (oparg), "i" (-EFAULT), "1" (0)); \
+ : "r" (oparg), "1" (0)); \
if (ret) \
goto label; \
*oval = oldval; \
@@ -95,15 +87,11 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
if (!user_access_begin(uaddr, sizeof(u32)))
return -EFAULT;
asm volatile("\n"
- "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
+ "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"
"2:\n"
- "\t.section .fixup, \"ax\"\n"
- "3:\tmov %3, %0\n"
- "\tjmp 2b\n"
- "\t.previous\n"
- _ASM_EXTABLE_UA(1b, 3b)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %0) \
: "+r" (ret), "=a" (oldval), "+m" (*uaddr)
- : "i" (-EFAULT), "r" (newval), "1" (oldval)
+ : "r" (newval), "1" (oldval)
: "memory"
);
user_access_end();
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 381e88122a5f..0a9407dc0859 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -602,6 +602,39 @@ enum hv_interrupt_type {
HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A,
};
+union hv_msi_address_register {
+ u32 as_uint32;
+ struct {
+ u32 reserved1:2;
+ u32 destination_mode:1;
+ u32 redirection_hint:1;
+ u32 reserved2:8;
+ u32 destination_id:8;
+ u32 msi_base:12;
+ };
+} __packed;
+
+union hv_msi_data_register {
+ u32 as_uint32;
+ struct {
+ u32 vector:8;
+ u32 delivery_mode:3;
+ u32 reserved1:3;
+ u32 level_assert:1;
+ u32 trigger_mode:1;
+ u32 reserved2:16;
+ };
+} __packed;
+
+/* HvRetargetDeviceInterrupt hypercall */
+union hv_msi_entry {
+ u64 as_uint64;
+ struct {
+ union hv_msi_address_register address;
+ union hv_msi_data_register data;
+ } __packed;
+};
+
#include <asm-generic/hyperv-tlfs.h>
#endif
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 4ec3613551e3..f07faa61c7f3 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -15,10 +15,13 @@
#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)
#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4))
+int pt_regs_offset(struct pt_regs *regs, int regno);
+
bool insn_has_rep_prefix(struct insn *insn);
void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs);
+unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs);
unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
int insn_get_code_seg_params(struct pt_regs *regs);
int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip);
@@ -29,4 +32,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs,
bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE], int buf_size);
+enum mmio_type {
+ MMIO_DECODE_FAILED,
+ MMIO_WRITE,
+ MMIO_WRITE_IMM,
+ MMIO_READ,
+ MMIO_READ_ZERO_EXTEND,
+ MMIO_READ_SIGN_EXTEND,
+ MMIO_MOVS,
+};
+
+enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
+
#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 5a0bcf8b78d7..048b6d5aff50 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -108,7 +108,7 @@
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
-#define INTEL_FAM6_RAPTOR_LAKE 0xB7
+#define INTEL_FAM6_RAPTORLAKE 0xB7
/* "Small Core" Processors (Atom) */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 5c6a4af0b911..f6d91ecb8026 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -40,6 +40,7 @@
#include <linux/string.h>
#include <linux/compiler.h>
+#include <linux/cc_platform.h>
#include <asm/page.h>
#include <asm/early_ioremap.h>
#include <asm/pgtable_types.h>
@@ -256,21 +257,6 @@ static inline void slow_down_io(void)
#endif
-#ifdef CONFIG_AMD_MEM_ENCRYPT
-#include <linux/jump_label.h>
-
-extern struct static_key_false sev_enable_key;
-static inline bool sev_key_active(void)
-{
- return static_branch_unlikely(&sev_enable_key);
-}
-
-#else /* !CONFIG_AMD_MEM_ENCRYPT */
-
-static inline bool sev_key_active(void) { return false; }
-
-#endif /* CONFIG_AMD_MEM_ENCRYPT */
-
#define BUILDIO(bwl, bw, type) \
static inline void out##bwl(unsigned type value, int port) \
{ \
@@ -301,7 +287,7 @@ static inline unsigned type in##bwl##_p(int port) \
\
static inline void outs##bwl(int port, const void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
out##bwl(*value, port); \
@@ -317,7 +303,7 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \
\
static inline void ins##bwl(int port, void *addr, unsigned long count) \
{ \
- if (sev_key_active()) { \
+ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \
unsigned type *value = (unsigned type *)addr; \
while (count) { \
*value = in##bwl(port); \
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index c5ce9845c999..87761396e8cc 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -114,8 +114,6 @@ static __always_inline unsigned long arch_local_irq_save(void)
#define SAVE_FLAGS pushfq; popq %rax
#endif
-#define INTERRUPT_RETURN jmp native_iret
-
#endif
#endif /* __ASSEMBLY__ */
@@ -143,8 +141,13 @@ static __always_inline void arch_local_irq_restore(unsigned long flags)
#ifdef CONFIG_X86_64
#ifdef CONFIG_XEN_PV
#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
+#define INTERRUPT_RETURN \
+ ANNOTATE_RETPOLINE_SAFE; \
+ ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \
+ X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;")
#else
#define SWAPGS swapgs
+#define INTERRUPT_RETURN jmp native_iret
#endif
#endif
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index cefe1d81e2e8..631d5040b31e 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -35,6 +35,7 @@ KVM_X86_OP(get_cpl)
KVM_X86_OP(set_segment)
KVM_X86_OP_NULL(get_cs_db_l_bits)
KVM_X86_OP(set_cr0)
+KVM_X86_OP_NULL(post_set_cr3)
KVM_X86_OP(is_valid_cr4)
KVM_X86_OP(set_cr4)
KVM_X86_OP(set_efer)
@@ -47,12 +48,14 @@ KVM_X86_OP(set_dr7)
KVM_X86_OP(cache_reg)
KVM_X86_OP(get_rflags)
KVM_X86_OP(set_rflags)
+KVM_X86_OP(get_if_flag)
KVM_X86_OP(tlb_flush_all)
KVM_X86_OP(tlb_flush_current)
KVM_X86_OP_NULL(tlb_remote_flush)
KVM_X86_OP_NULL(tlb_remote_flush_with_range)
KVM_X86_OP(tlb_flush_gva)
KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(vcpu_pre_run)
KVM_X86_OP(run)
KVM_X86_OP_NULL(handle_exit)
KVM_X86_OP_NULL(skip_emulated_instruction)
@@ -96,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff)
KVM_X86_OP_NULL(request_immediate_exit)
KVM_X86_OP(sched_in)
KVM_X86_OP_NULL(update_cpu_dirty_logging)
-KVM_X86_OP_NULL(pre_block)
-KVM_X86_OP_NULL(post_block)
KVM_X86_OP_NULL(vcpu_blocking)
KVM_X86_OP_NULL(vcpu_unblocking)
KVM_X86_OP_NULL(update_pi_irte)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6ac61f85e07b..6e7c545bc7ee 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -97,7 +97,7 @@
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
#define KVM_REQ_TLB_FLUSH_GUEST \
- KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
+ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
@@ -135,7 +135,7 @@
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
-#define KVM_PERMILLE_MMU_PAGES 20
+#define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
#define KVM_MIN_ALLOC_MMU_PAGES 64UL
#define KVM_MMU_HASH_SHIFT 12
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
@@ -291,25 +291,31 @@ struct kvm_kernel_irq_routing_entry;
* the number of unique SPs that can theoretically be created is 2^n, where n
* is the number of bits that are used to compute the role.
*
- * But, even though there are 18 bits in the mask below, not all combinations
- * of modes and flags are possible. The maximum number of possible upper-level
- * shadow pages for a single gfn is in the neighborhood of 2^13.
+ * But, even though there are 19 bits in the mask below, not all combinations
+ * of modes and flags are possible:
*
- * - invalid shadow pages are not accounted.
- * - level is effectively limited to four combinations, not 16 as the number
- * bits would imply, as 4k SPs are not tracked (allowed to go unsync).
- * - level is effectively unused for non-PAE paging because there is exactly
- * one upper level (see 4k SP exception above).
- * - quadrant is used only for non-PAE paging and is exclusive with
- * gpte_is_8_bytes.
- * - execonly and ad_disabled are used only for nested EPT, which makes it
- * exclusive with quadrant.
+ * - invalid shadow pages are not accounted, so the bits are effectively 18
+ *
+ * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
+ * execonly and ad_disabled are only used for nested EPT which has
+ * has_4_byte_gpte=0. Therefore, 2 bits are always unused.
+ *
+ * - the 4 bits of level are effectively limited to the values 2/3/4/5,
+ * as 4k SPs are not tracked (allowed to go unsync). In addition non-PAE
+ * paging has exactly one upper level, making level completely redundant
+ * when has_4_byte_gpte=1.
+ *
+ * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
+ * cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
+ *
+ * Therefore, the maximum number of possible upper-level shadow pages for a
+ * single gfn is a bit less than 2^13.
*/
union kvm_mmu_page_role {
u32 word;
struct {
unsigned level:4;
- unsigned gpte_is_8_bytes:1;
+ unsigned has_4_byte_gpte:1;
unsigned quadrant:2;
unsigned direct:1;
unsigned access:3;
@@ -420,10 +426,9 @@ struct kvm_mmu {
int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void (*inject_page_fault)(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa,
- u32 access, struct x86_exception *exception);
- gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t gva_or_gpa, u32 access,
+ struct x86_exception *exception);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
@@ -490,6 +495,7 @@ struct kvm_pmc {
*/
u64 current_config;
bool is_paused;
+ bool intr;
};
struct kvm_pmu {
@@ -604,6 +610,7 @@ struct kvm_vcpu_xen {
u64 last_steal;
u64 runstate_entry_time;
u64 runstate_times[4];
+ unsigned long evtchn_pending_sel;
};
struct kvm_vcpu_arch {
@@ -640,6 +647,7 @@ struct kvm_vcpu_arch {
u64 smi_count;
bool tpr_access_reporting;
bool xsaves_enabled;
+ bool xfd_no_write_intercept;
u64 ia32_xss;
u64 microcode_version;
u64 arch_capabilities;
@@ -774,6 +782,7 @@ struct kvm_vcpu_arch {
unsigned nmi_pending; /* NMI queued after currently running handler */
bool nmi_injected; /* Trying to inject an NMI this entry */
bool smi_pending; /* SMI queued after currently running handler */
+ u8 handling_intr_from_guest;
struct kvm_mtrr mtrr_state;
u64 pat;
@@ -1014,7 +1023,7 @@ struct msr_bitmap_range {
struct kvm_xen {
bool long_mode;
u8 upcall_vector;
- gfn_t shinfo_gfn;
+ struct gfn_to_pfn_cache shinfo_cache;
};
enum kvm_irqchip_mode {
@@ -1036,6 +1045,7 @@ struct kvm_x86_msr_filter {
#define APICV_INHIBIT_REASON_PIT_REINJ 4
#define APICV_INHIBIT_REASON_X2APIC 5
#define APICV_INHIBIT_REASON_BLOCKIRQ 6
+#define APICV_INHIBIT_REASON_ABSENT 7
struct kvm_arch {
unsigned long n_used_mmu_pages;
@@ -1336,6 +1346,7 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+ void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
@@ -1348,6 +1359,7 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ bool (*get_if_flag)(struct kvm_vcpu *vcpu);
void (*tlb_flush_all)(struct kvm_vcpu *vcpu);
void (*tlb_flush_current)(struct kvm_vcpu *vcpu);
@@ -1369,6 +1381,7 @@ struct kvm_x86_ops {
*/
void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
+ int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
@@ -1442,18 +1455,6 @@ struct kvm_x86_ops {
const struct kvm_pmu_ops *pmu_ops;
const struct kvm_x86_nested_ops *nested_ops;
- /*
- * Architecture specific hooks for vCPU blocking due to
- * HLT instruction.
- * Returns for .pre_block():
- * - 0 means continue to block the vCPU.
- * - 1 means we cannot block the vCPU since some event
- * happens during this period, such as, 'ON' bit in
- * posted-interrupts descriptor is set.
- */
- int (*pre_block)(struct kvm_vcpu *vcpu);
- void (*post_block)(struct kvm_vcpu *vcpu);
-
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
@@ -1482,7 +1483,8 @@ struct kvm_x86_ops {
int (*get_msr_feature)(struct kvm_msr_entry *entry);
- bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
+ bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
@@ -1495,6 +1497,7 @@ struct kvm_x86_ops {
};
struct kvm_x86_nested_ops {
+ void (*leave_nested)(struct kvm_vcpu *vcpu);
int (*check_events)(struct kvm_vcpu *vcpu);
bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
void (*triple_fault)(struct kvm_vcpu *vcpu);
@@ -1517,6 +1520,7 @@ struct kvm_x86_init_ops {
int (*disabled_by_bios)(void);
int (*check_processor_compatibility)(void);
int (*hardware_setup)(void);
+ unsigned int (*handle_intel_pt_intr)(void);
struct kvm_x86_ops *runtime_ops;
};
@@ -1566,6 +1570,9 @@ static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
return -ENOTSUPP;
}
+#define kvm_arch_pmi_in_guest(vcpu) \
+ ((vcpu) && (vcpu)->arch.handling_intr_from_guest)
+
int kvm_mmu_module_init(void);
void kvm_mmu_module_exit(void);
@@ -1585,10 +1592,9 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
@@ -1638,7 +1644,8 @@ extern u64 kvm_mce_cap_supported;
*
* EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
* decode the instruction length. For use *only* by
- * kvm_x86_ops.skip_emulated_instruction() implementations.
+ * kvm_x86_ops.skip_emulated_instruction() implementations if
+ * EMULTYPE_COMPLETE_USER_EXIT is not set.
*
* EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to
* retry native execution under certain conditions,
@@ -1658,6 +1665,10 @@ extern u64 kvm_mce_cap_supported;
*
* EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
* case the CR2/GPA value pass on the stack is valid.
+ *
+ * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
+ * state and inject single-step #DBs after skipping
+ * an instruction (after completing userspace I/O).
*/
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
@@ -1666,6 +1677,7 @@ extern u64 kvm_mce_cap_supported;
#define EMULTYPE_TRAP_UD_FORCED (1 << 4)
#define EMULTYPE_VMWARE_GP (1 << 5)
#define EMULTYPE_PF (1 << 6)
+#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
@@ -1690,7 +1702,7 @@ int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
-int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu);
int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
@@ -1756,12 +1768,9 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
ulong roots_to_free);
void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu);
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -1854,7 +1863,6 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
-void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
@@ -1895,8 +1903,6 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
-int kvm_is_in_guest(void);
-
void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
u32 size);
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
@@ -1925,8 +1931,6 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
}
-static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
-
static inline int kvm_cpu_get_apicid(int mps_cpu)
{
#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index 9d4a3b1b25b9..eb186bc57f6a 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -63,9 +63,9 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
void kvm_slot_page_track_remove_page(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode);
-bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu,
- struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode);
+bool kvm_slot_page_track_is_active(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t gfn, enum kvm_page_track_mode mode);
void
kvm_page_track_register_notifier(struct kvm *kvm,
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 365111789cc6..030907922bd0 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -18,6 +18,20 @@
#define __ALIGN_STR __stringify(__ALIGN)
#endif
+#ifdef CONFIG_SLS
+#define RET ret; int3
+#else
+#define RET ret
+#endif
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_SLS
+#define ASM_RET "ret; int3\n\t"
+#else
+#define ASM_RET "ret\n\t"
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_LINKAGE_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 8f6395d9e209..cc73061e7255 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -313,31 +313,22 @@ enum smca_bank_types {
SMCA_SMU, /* System Management Unit */
SMCA_SMU_V2,
SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_MPDMA, /* MPDMA Unit */
SMCA_NBIO, /* Northbridge IO Unit */
SMCA_PCIE, /* PCI Express Unit */
SMCA_PCIE_V2,
SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_NBIF, /* NBIF Unit */
+ SMCA_SHUB, /* System HUB Unit */
+ SMCA_SATA, /* SATA Unit */
+ SMCA_USB, /* USB Unit */
+ SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
+ SMCA_GMI_PHY, /* GMI PHY Unit */
N_SMCA_BANK_TYPES
};
-#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
-
-struct smca_hwid {
- unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
- u32 hwid_mcatype; /* (hwid,mcatype) tuple */
- u8 count; /* Number of instances. */
-};
-
-struct smca_bank {
- struct smca_hwid *hwid;
- u32 id; /* Value of MCA_IPID[InstanceId]. */
- u8 sysfs_id; /* Value used for sysfs name. */
-};
-
-extern struct smca_bank smca_banks[MAX_NR_BANKS];
-
extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);
@@ -345,16 +336,13 @@ extern int mce_threshold_create_device(unsigned int cpu);
extern int mce_threshold_remove_device(unsigned int cpu);
void mce_amd_feature_init(struct cpuinfo_x86 *c);
-int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr);
-enum smca_bank_types smca_get_bank_type(unsigned int bank);
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank);
#else
static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
-static inline int
-umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; };
#endif
static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); }
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
index f572d0f944bb..e69de29bb2d1 100644
--- a/arch/x86/include/asm/mmx.h
+++ b/arch/x86/include/asm/mmx.h
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_MMX_H
-#define _ASM_X86_MMX_H
-
-/*
- * MMX 3Dnow! helper operations
- */
-
-#include <linux/types.h>
-
-extern void *_mmx_memcpy(void *to, const void *from, size_t size);
-extern void mmx_clear_page(void *page);
-extern void mmx_copy_page(void *to, void *from);
-
-#endif /* _ASM_X86_MMX_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index da3972fe5a7a..a82f603d4312 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -30,7 +30,7 @@ extern void *hv_hypercall_pg;
extern u64 hv_current_partition_id;
-extern union hv_ghcb __percpu **hv_ghcb_pg;
+extern union hv_ghcb * __percpu *hv_ghcb_pg;
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
@@ -169,13 +169,6 @@ bool hv_vcpu_is_preempted(int vcpu);
static inline void hv_apic_init(void) {}
#endif
-static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
- struct msi_desc *msi_desc)
-{
- msi_entry->address.as_uint32 = msi_desc->msg.address_lo;
- msi_entry->data.as_uint32 = msi_desc->msg.data;
-}
-
struct irq_domain *hv_create_pci_msi_domain(void);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 01e2650b9585..3faf0f97edb1 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -486,6 +486,23 @@
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
+/* AMD Collaborative Processor Performance Control MSRs */
+#define MSR_AMD_CPPC_CAP1 0xc00102b0
+#define MSR_AMD_CPPC_ENABLE 0xc00102b1
+#define MSR_AMD_CPPC_CAP2 0xc00102b2
+#define MSR_AMD_CPPC_REQ 0xc00102b3
+#define MSR_AMD_CPPC_STATUS 0xc00102b4
+
+#define AMD_CPPC_LOWEST_PERF(x) (((x) >> 0) & 0xff)
+#define AMD_CPPC_LOWNONLIN_PERF(x) (((x) >> 8) & 0xff)
+#define AMD_CPPC_NOMINAL_PERF(x) (((x) >> 16) & 0xff)
+#define AMD_CPPC_HIGHEST_PERF(x) (((x) >> 24) & 0xff)
+
+#define AMD_CPPC_MAX_PERF(x) (((x) & 0xff) << 0)
+#define AMD_CPPC_MIN_PERF(x) (((x) & 0xff) << 8)
+#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16)
+#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24)
+
/* Fam 17h MSRs */
#define MSR_F17H_IRPERF 0xc00000e9
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 6b52182e178a..d42e6c6b47b1 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -137,17 +137,11 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
{
DECLARE_ARGS(val, low, high);
- asm volatile("2: rdmsr ; xor %[err],%[err]\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %[fault],%[err]\n\t"
- "xorl %%eax, %%eax\n\t"
- "xorl %%edx, %%edx\n\t"
- "jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
+ asm volatile("1: rdmsr ; xor %[err],%[err]\n"
+ "2:\n\t"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
: [err] "=r" (*err), EAX_EDX_RET(val, low, high)
- : "c" (msr), [fault] "i" (-EIO));
+ : "c" (msr));
if (tracepoint_enabled(read_msr))
do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err);
return EAX_EDX_VAL(val, low, high);
@@ -169,15 +163,11 @@ native_write_msr_safe(unsigned int msr, u32 low, u32 high)
{
int err;
- asm volatile("2: wrmsr ; xor %[err],%[err]\n"
- "1:\n\t"
- ".section .fixup,\"ax\"\n\t"
- "3: mov %[fault],%[err] ; jmp 1b\n\t"
- ".previous\n\t"
- _ASM_EXTABLE(2b, 3b)
+ asm volatile("1: wrmsr ; xor %[err],%[err]\n"
+ "2:\n\t"
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err])
: [err] "=a" (err)
- : "c" (msr), "0" (low), "d" (high),
- [fault] "i" (-EIO)
+ : "c" (msr), "0" (low), "d" (high)
: "memory");
if (tracepoint_enabled(write_msr))
do_trace_write_msr(msr, ((u64)high << 32 | low), err);
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 829df26fd7a3..76d726074c16 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -24,8 +24,8 @@
#define _ASM_X86_MTRR_H
#include <uapi/asm/mtrr.h>
-#include <asm/memtype.h>
+void mtrr_bp_init(void);
/*
* The following functions are for use by other drivers that cannot use
@@ -43,7 +43,6 @@ extern int mtrr_del(int reg, unsigned long base, unsigned long size);
extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
extern void mtrr_ap_init(void);
-extern void mtrr_bp_init(void);
extern void set_mtrr_aps_delayed_init(void);
extern void mtrr_aps_init(void);
extern void mtrr_bp_restore(void);
@@ -84,11 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
{
}
-static inline void mtrr_bp_init(void)
-{
- pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
-}
-
#define mtrr_ap_init() do {} while (0)
#define set_mtrr_aps_delayed_init() do {} while (0)
#define mtrr_aps_init() do {} while (0)
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index b13f8488ac85..df42f8aa99e4 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -19,19 +19,6 @@ extern unsigned long __phys_addr(unsigned long);
#define pfn_valid(pfn) ((pfn) < max_mapnr)
#endif /* CONFIG_FLATMEM */
-#ifdef CONFIG_X86_USE_3DNOW
-#include <asm/mmx.h>
-
-static inline void clear_page(void *page)
-{
- mmx_clear_page(page);
-}
-
-static inline void copy_page(void *to, void *from)
-{
- mmx_copy_page(to, from);
-}
-#else /* !CONFIG_X86_USE_3DNOW */
#include <linux/string.h>
static inline void clear_page(void *page)
@@ -43,7 +30,6 @@ static inline void copy_page(void *to, void *from)
{
memcpy(to, from, PAGE_SIZE);
}
-#endif /* CONFIG_X86_USE_3DNOW */
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PAGE_32_H */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 4bde0dc66100..e9c86299b835 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -5,6 +5,7 @@
#include <asm/page_64_types.h>
#ifndef __ASSEMBLY__
+#include <asm/cpufeatures.h>
#include <asm/alternative.h>
/* duplicated to the one in bootmem.h */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 21c4a694ca11..0d76502cc6f5 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -671,7 +671,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
"call " #func ";" \
PV_RESTORE_ALL_CALLER_REGS \
FRAME_END \
- "ret;" \
+ ASM_RET \
".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \
".popsection")
@@ -752,11 +752,6 @@ extern void default_banner(void);
#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#define INTERRUPT_RETURN \
- ANNOTATE_RETPOLINE_SAFE; \
- ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \
- X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;")
-
#ifdef CONFIG_DEBUG_ENTRY
.macro PARA_IRQ_save_fl
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 448cd01eb3ec..8a9432fb3802 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -22,10 +22,12 @@
#define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot)))
#ifndef __ASSEMBLY__
+#include <linux/spinlock.h>
#include <asm/x86_init.h>
#include <asm/pkru.h>
#include <asm/fpu/api.h>
#include <asm-generic/pgtable_uffd.h>
+#include <linux/page_table_check.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD];
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
@@ -752,7 +754,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
return true;
if ((pte_flags(a) & _PAGE_PROTNONE) &&
- mm_tlb_flush_pending(mm))
+ atomic_read(&mm->tlb_flush_pending))
return true;
return false;
@@ -1006,18 +1008,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
+ page_table_check_pte_set(mm, addr, ptep, pte);
set_pte(ptep, pte);
}
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
+ page_table_check_pmd_set(mm, addr, pmdp, pmd);
set_pmd(pmdp, pmd);
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
+ page_table_check_pud_set(mm, addr, pudp, pud);
native_set_pud(pudp, pud);
}
@@ -1048,6 +1053,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_t pte = native_ptep_get_and_clear(ptep);
+ page_table_check_pte_clear(mm, addr, pte);
return pte;
}
@@ -1063,12 +1069,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
* care about updates and native needs no locking
*/
pte = native_local_ptep_get_and_clear(ptep);
+ page_table_check_pte_clear(mm, addr, pte);
} else {
pte = ptep_get_and_clear(mm, addr, ptep);
}
return pte;
}
+#define __HAVE_ARCH_PTEP_CLEAR
+static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
+{
+ if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK))
+ ptep_get_and_clear(mm, addr, ptep);
+ else
+ pte_clear(mm, addr, ptep);
+}
+
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
@@ -1109,14 +1126,22 @@ static inline int pmd_write(pmd_t pmd)
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
{
- return native_pmdp_get_and_clear(pmdp);
+ pmd_t pmd = native_pmdp_get_and_clear(pmdp);
+
+ page_table_check_pmd_clear(mm, addr, pmd);
+
+ return pmd;
}
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
unsigned long addr, pud_t *pudp)
{
- return native_pudp_get_and_clear(pudp);
+ pud_t pud = native_pudp_get_and_clear(pudp);
+
+ page_table_check_pud_clear(mm, addr, pud);
+
+ return pud;
}
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
@@ -1137,6 +1162,7 @@ static inline int pud_write(pud_t pud)
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
+ page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
if (IS_ENABLED(CONFIG_SMP)) {
return xchg(pmdp, pmd);
} else {
diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h
index 4cd49afa0ca4..74f0a2d34ffd 100644
--- a/arch/x86/include/asm/pkru.h
+++ b/arch/x86/include/asm/pkru.h
@@ -4,8 +4,8 @@
#include <asm/cpufeature.h>
-#define PKRU_AD_BIT 0x1
-#define PKRU_WD_BIT 0x2
+#define PKRU_AD_BIT 0x1u
+#define PKRU_WD_BIT 0x2u
#define PKRU_BITS_PER_PKEY 2
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 355d38c0cf60..2c5f12ae7d04 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -855,4 +855,12 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
+#ifdef CONFIG_X86_SGX
+int arch_memory_failure(unsigned long pfn, int flags);
+#define arch_memory_failure arch_memory_failure
+
+bool arch_is_platform_page(u64 paddr);
+#define arch_is_platform_page arch_is_platform_page
+#endif
+
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index d86ab942219c..d87451df480b 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -53,6 +53,7 @@ static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
static inline void queued_spin_unlock(struct qspinlock *lock)
{
+ kcsan_release();
pv_queued_spin_unlock(lock);
}
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 159622ee0674..1474cf96251d 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -48,7 +48,7 @@ asm (".pushsection .text;"
"jne .slowpath;"
"pop %rdx;"
FRAME_END
- "ret;"
+ ASM_RET
".slowpath: "
"push %rsi;"
"movzbl %al,%esi;"
@@ -56,7 +56,7 @@ asm (".pushsection .text;"
"pop %rsi;"
"pop %rdx;"
FRAME_END
- "ret;"
+ ASM_RET
".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
".popsection");
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 5db5d083c873..331474b150f1 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -89,6 +89,7 @@ static inline void set_real_mode_mem(phys_addr_t mem)
}
void reserve_real_mode(void);
+void load_trampoline_pgtable(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index b2d504f11937..aff774775c67 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -35,11 +35,7 @@
# define NEED_CMOV 0
#endif
-#ifdef CONFIG_X86_USE_3DNOW
-# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31))
-#else
# define NEED_3DNOW 0
-#endif
#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 8dd8e8ec9fa5..b228c9d44ee7 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -307,14 +307,7 @@ do { \
\
asm volatile(" \n" \
"1: movl %k0,%%" #seg " \n" \
- \
- ".section .fixup,\"ax\" \n" \
- "2: xorl %k0,%k0 \n" \
- " jmp 1b \n" \
- ".previous \n" \
- \
- _ASM_EXTABLE(1b, 2b) \
- \
+ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k0)\
: "+r" (__val) : : "memory"); \
} while (0)
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 872617542bbc..ff0f2d90338a 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_SET_MEMORY_H
#define _ASM_X86_SET_MEMORY_H
+#include <linux/mm.h>
#include <asm/page.h>
#include <asm-generic/set_memory.h>
@@ -99,6 +100,9 @@ static inline int set_mce_nospec(unsigned long pfn, bool unmap)
unsigned long decoy_addr;
int rc;
+ /* SGX pages are not in the 1:1 map */
+ if (arch_is_platform_page(pfn << PAGE_SHIFT))
+ return 0;
/*
* We would like to just call:
* set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 2cef6c5a52c2..1b2fd32b42fe 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -18,20 +18,19 @@
/* SEV Information Request/Response */
#define GHCB_MSR_SEV_INFO_RESP 0x001
#define GHCB_MSR_SEV_INFO_REQ 0x002
-#define GHCB_MSR_VER_MAX_POS 48
-#define GHCB_MSR_VER_MAX_MASK 0xffff
-#define GHCB_MSR_VER_MIN_POS 32
-#define GHCB_MSR_VER_MIN_MASK 0xffff
-#define GHCB_MSR_CBIT_POS 24
-#define GHCB_MSR_CBIT_MASK 0xff
-#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
- ((((_max) & GHCB_MSR_VER_MAX_MASK) << GHCB_MSR_VER_MAX_POS) | \
- (((_min) & GHCB_MSR_VER_MIN_MASK) << GHCB_MSR_VER_MIN_POS) | \
- (((_cbit) & GHCB_MSR_CBIT_MASK) << GHCB_MSR_CBIT_POS) | \
+
+#define GHCB_MSR_SEV_INFO(_max, _min, _cbit) \
+ /* GHCBData[63:48] */ \
+ ((((_max) & 0xffff) << 48) | \
+ /* GHCBData[47:32] */ \
+ (((_min) & 0xffff) << 32) | \
+ /* GHCBData[31:24] */ \
+ (((_cbit) & 0xff) << 24) | \
GHCB_MSR_SEV_INFO_RESP)
+
#define GHCB_MSR_INFO(v) ((v) & 0xfffUL)
-#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK)
-#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK)
+#define GHCB_MSR_PROTO_MAX(v) (((v) >> 48) & 0xffff)
+#define GHCB_MSR_PROTO_MIN(v) (((v) >> 32) & 0xffff)
/* CPUID Request/Response */
#define GHCB_MSR_CPUID_REQ 0x004
@@ -46,31 +45,48 @@
#define GHCB_CPUID_REQ_EBX 1
#define GHCB_CPUID_REQ_ECX 2
#define GHCB_CPUID_REQ_EDX 3
-#define GHCB_CPUID_REQ(fn, reg) \
- (GHCB_MSR_CPUID_REQ | \
- (((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \
- (((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS))
+#define GHCB_CPUID_REQ(fn, reg) \
+ /* GHCBData[11:0] */ \
+ (GHCB_MSR_CPUID_REQ | \
+ /* GHCBData[31:12] */ \
+ (((unsigned long)(reg) & 0x3) << 30) | \
+ /* GHCBData[63:32] */ \
+ (((unsigned long)fn) << 32))
/* AP Reset Hold */
-#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
-#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
/* GHCB Hypervisor Feature Request/Response */
-#define GHCB_MSR_HV_FT_REQ 0x080
-#define GHCB_MSR_HV_FT_RESP 0x081
+#define GHCB_MSR_HV_FT_REQ 0x080
+#define GHCB_MSR_HV_FT_RESP 0x081
#define GHCB_MSR_TERM_REQ 0x100
#define GHCB_MSR_TERM_REASON_SET_POS 12
#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
#define GHCB_MSR_TERM_REASON_POS 16
#define GHCB_MSR_TERM_REASON_MASK 0xff
-#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
- (((((u64)reason_set) & GHCB_MSR_TERM_REASON_SET_MASK) << GHCB_MSR_TERM_REASON_SET_POS) | \
- ((((u64)reason_val) & GHCB_MSR_TERM_REASON_MASK) << GHCB_MSR_TERM_REASON_POS))
-#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0
-#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1
+#define GHCB_SEV_TERM_REASON(reason_set, reason_val) \
+ /* GHCBData[15:12] */ \
+ (((((u64)reason_set) & 0xf) << 12) | \
+ /* GHCBData[23:16] */ \
+ ((((u64)reason_val) & 0xff) << 16))
+
+#define GHCB_SEV_ES_GEN_REQ 0
+#define GHCB_SEV_ES_PROT_UNSUPPORTED 1
#define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK)
+/*
+ * Error codes related to GHCB input that can be communicated back to the guest
+ * by setting the lower 32-bits of the GHCB SW_EXITINFO1 field to 2.
+ */
+#define GHCB_ERR_NOT_REGISTERED 1
+#define GHCB_ERR_INVALID_USAGE 2
+#define GHCB_ERR_INVALID_SCRATCH_AREA 3
+#define GHCB_ERR_MISSING_INPUT 4
+#define GHCB_ERR_INVALID_INPUT 5
+#define GHCB_ERR_INVALID_EVENT 6
+
#endif
diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 05f3e21f01a7..3f9334ef67cd 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -46,6 +46,24 @@ enum sgx_encls_function {
};
/**
+ * SGX_ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
+ *
+ * ENCLS has its own (positive value) error codes and also generates
+ * ENCLS specific #GP and #PF faults. And the ENCLS values get munged
+ * with system error codes as everything percolates back up the stack.
+ * Unfortunately (for us), we need to precisely identify each unique
+ * error code, e.g. the action taken if EWB fails varies based on the
+ * type of fault and on the exact SGX error code, i.e. we can't simply
+ * convert all faults to -EFAULT.
+ *
+ * To make all three error types coexist, we set bit 30 to identify an
+ * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate
+ * between positive (faults and SGX error codes) and negative (system
+ * error codes) values.
+ */
+#define SGX_ENCLS_FAULT_FLAG 0x40000000
+
+/**
* enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
* %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
* been completed yet.
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index 39ebe0511869..ed4f8bb6c2d9 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -36,7 +36,7 @@
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)")
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
- __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop")
#define ARCH_ADD_TRAMP_KEY(name) \
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index f74362b05619..32c0d981a82a 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -146,42 +146,9 @@ static __always_inline void *__constant_memcpy(void *to, const void *from,
extern void *memcpy(void *, const void *, size_t);
#ifndef CONFIG_FORTIFY_SOURCE
-#ifdef CONFIG_X86_USE_3DNOW
-
-#include <asm/mmx.h>
-
-/*
- * This CPU favours 3DNow strongly (eg AMD Athlon)
- */
-
-static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
-{
- if (len < 512)
- return __constant_memcpy(to, from, len);
- return _mmx_memcpy(to, from, len);
-}
-
-static inline void *__memcpy3d(void *to, const void *from, size_t len)
-{
- if (len < 512)
- return __memcpy(to, from, len);
- return _mmx_memcpy(to, from, len);
-}
-
-#define memcpy(t, f, n) \
- (__builtin_constant_p((n)) \
- ? __constant_memcpy3d((t), (f), (n)) \
- : __memcpy3d((t), (f), (n)))
-
-#else
-
-/*
- * No 3D Now!
- */
#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
-#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
#define __HAVE_ARCH_MEMMOVE
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index b587a9ee9cb2..98fa0a114074 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,4 +261,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#endif /* !MODULE */
+static inline void __native_tlb_flush_global(unsigned long cr4)
+{
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ native_write_cr4(cr4);
+}
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index cc164777e661..2f0b6be8eaab 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -221,7 +221,7 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled)
}
#endif
-#ifdef CONFIG_ACPI_CPPC_LIB
+#if defined(CONFIG_ACPI_CPPC_LIB) && defined(CONFIG_SMP)
void init_freq_invariance_cppc(void);
#define init_freq_invariance_cppc init_freq_invariance_cppc
#endif
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 33a68407def3..ac96f9b2d64b 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -314,11 +314,12 @@ do { \
do { \
__chk_user_ptr(ptr); \
switch (size) { \
- unsigned char x_u8__; \
- case 1: \
+ case 1: { \
+ unsigned char x_u8__; \
__get_user_asm(x_u8__, ptr, "b", "=q", label); \
(x) = x_u8__; \
break; \
+ } \
case 2: \
__get_user_asm(x, ptr, "w", "=r", label); \
break; \
@@ -351,24 +352,22 @@ do { \
"1: movl %[lowbits],%%eax\n" \
"2: movl %[highbits],%%edx\n" \
"3:\n" \
- ".section .fixup,\"ax\"\n" \
- "4: mov %[efault],%[errout]\n" \
- " xorl %%eax,%%eax\n" \
- " xorl %%edx,%%edx\n" \
- " jmp 3b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(1b, 4b) \
- _ASM_EXTABLE_UA(2b, 4b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX_DX, \
+ %[errout]) \
+ _ASM_EXTABLE_TYPE_REG(2b, 3b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX_DX, \
+ %[errout]) \
: [errout] "=r" (retval), \
[output] "=&A"(x) \
: [lowbits] "m" (__m(__ptr)), \
[highbits] "m" __m(((u32 __user *)(__ptr)) + 1), \
- [efault] "i" (-EFAULT), "0" (retval)); \
+ "0" (retval)); \
})
#else
#define __get_user_asm_u64(x, ptr, retval) \
- __get_user_asm(x, ptr, retval, "q", "=r")
+ __get_user_asm(x, ptr, retval, "q")
#endif
#define __get_user_size(x, ptr, size, retval) \
@@ -379,14 +378,14 @@ do { \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
- __get_user_asm(x_u8__, ptr, retval, "b", "=q"); \
+ __get_user_asm(x_u8__, ptr, retval, "b"); \
(x) = x_u8__; \
break; \
case 2: \
- __get_user_asm(x, ptr, retval, "w", "=r"); \
+ __get_user_asm(x, ptr, retval, "w"); \
break; \
case 4: \
- __get_user_asm(x, ptr, retval, "l", "=r"); \
+ __get_user_asm(x, ptr, retval, "l"); \
break; \
case 8: \
__get_user_asm_u64(x, ptr, retval); \
@@ -396,20 +395,17 @@ do { \
} \
} while (0)
-#define __get_user_asm(x, addr, err, itype, ltype) \
+#define __get_user_asm(x, addr, err, itype) \
asm volatile("\n" \
"1: mov"itype" %[umem],%[output]\n" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: mov %[efault],%[errout]\n" \
- " xorl %k[output],%k[output]\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(1b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG | \
+ EX_FLAG_CLEAR_AX, \
+ %[errout]) \
: [errout] "=r" (err), \
- [output] ltype(x) \
+ [output] "=a" (x) \
: [umem] "m" (__m(addr)), \
- [efault] "i" (-EFAULT), "0" (err))
+ "0" (err))
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index 06006b0351f3..8338b0432b50 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -77,30 +77,58 @@ static inline unsigned long find_zero(unsigned long mask)
* and the next page not being mapped, take the exception and
* return zeroes in the non-existing part.
*/
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
- unsigned long ret, dummy;
+ unsigned long offset, data;
+ unsigned long ret;
+
+ asm_volatile_goto(
+ "1: mov %[mem], %[ret]\n"
+
+ _ASM_EXTABLE(1b, %l[do_exception])
+
+ : [ret] "=r" (ret)
+ : [mem] "m" (*(unsigned long *)addr)
+ : : do_exception);
+
+ return ret;
+
+do_exception:
+ offset = (unsigned long)addr & (sizeof(long) - 1);
+ addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1));
+ data = *(unsigned long *)addr;
+ ret = data >> offset * 8;
+
+ return ret;
+}
- asm(
- "1:\tmov %2,%0\n"
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
+static inline unsigned long load_unaligned_zeropad(const void *addr)
+{
+ unsigned long offset, data;
+ unsigned long ret, err = 0;
+
+ asm( "1: mov %[mem], %[ret]\n"
"2:\n"
- ".section .fixup,\"ax\"\n"
- "3:\t"
- "lea %2,%1\n\t"
- "and %3,%1\n\t"
- "mov (%1),%0\n\t"
- "leal %2,%%ecx\n\t"
- "andl %4,%%ecx\n\t"
- "shll $3,%%ecx\n\t"
- "shr %%cl,%0\n\t"
- "jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- :"=&r" (ret),"=&c" (dummy)
- :"m" (*(unsigned long *)addr),
- "i" (-sizeof(unsigned long)),
- "i" (sizeof(unsigned long)-1));
+
+ _ASM_EXTABLE_FAULT(1b, 2b)
+
+ : [ret] "=&r" (ret), "+a" (err)
+ : [mem] "m" (*(unsigned long *)addr));
+
+ if (unlikely(err)) {
+ offset = (unsigned long)addr & (sizeof(long) - 1);
+ addr = (void *)((unsigned long)addr & ~(sizeof(long) - 1));
+ data = *(unsigned long *)addr;
+ ret = data >> offset * 8;
+ }
+
return ret;
}
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
#endif /* _ASM_WORD_AT_A_TIME_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 5c69f7eb5d47..22b7412c08f6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -289,12 +289,6 @@ struct x86_platform_ops {
struct x86_hyper_runtime hyper;
};
-struct pci_dev;
-
-struct x86_msi_ops {
- void (*restore_msi_irqs)(struct pci_dev *dev);
-};
-
struct x86_apic_ops {
unsigned int (*io_apic_read) (unsigned int apic, unsigned int reg);
void (*restore)(void);
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 0575f5863b7f..e5e0fe10c692 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -281,13 +281,13 @@ HYPERVISOR_callback_op(int cmd, void *arg)
return _hypercall2(int, callback_op, cmd, arg);
}
-static inline int
+static __always_inline int
HYPERVISOR_set_debugreg(int reg, unsigned long value)
{
return _hypercall2(int, set_debugreg, reg, value);
}
-static inline unsigned long
+static __always_inline unsigned long
HYPERVISOR_get_debugreg(int reg)
{
return _hypercall1(unsigned long, get_debugreg, reg);
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 4957f59deb40..1bf2ad34188a 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -57,6 +57,14 @@ static inline bool __init xen_x2apic_para_available(void)
}
#endif
+struct pci_dev;
+
+#ifdef CONFIG_XEN_PV_DOM0
+bool xen_initdom_restore_msi(struct pci_dev *dev);
+#else
+static inline bool xen_initdom_restore_msi(struct pci_dev *dev) { return true; }
+#endif
+
#ifdef CONFIG_HOTPLUG_CPU
void xen_arch_register_cpu(int num);
void xen_arch_unregister_cpu(int num);
@@ -64,6 +72,7 @@ void xen_arch_unregister_cpu(int num);
#ifdef CONFIG_PVH
void __init xen_pvh_init(struct boot_params *boot_params);
+void __init mem_map_via_hcall(struct boot_params *boot_params_p);
#endif
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 1a162e559753..e989bc2269f5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -96,11 +96,7 @@ static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
asm volatile("1: mov %[val], %[ptr]\n"
"2:\n"
- ".section .fixup, \"ax\"\n"
- "3: sub $1, %[ret]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[ret])
: [ret] "+r" (ret), [ptr] "=m" (*addr)
: [val] "r" (val));
@@ -110,16 +106,12 @@ static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
static inline int xen_safe_read_ulong(const unsigned long *addr,
unsigned long *val)
{
- int ret = 0;
unsigned long rval = ~0ul;
+ int ret = 0;
asm volatile("1: mov %[ptr], %[rval]\n"
"2:\n"
- ".section .fixup, \"ax\"\n"
- "3: sub $1, %[ret]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[ret])
: [ret] "+r" (ret), [rval] "+r" (rval)
: [ptr] "m" (*addr));
*val = rval;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5a776a08f78c..bf6e96011dfe 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -373,9 +373,23 @@ struct kvm_debugregs {
__u64 reserved[9];
};
-/* for KVM_CAP_XSAVE */
+/* for KVM_CAP_XSAVE and KVM_CAP_XSAVE2 */
struct kvm_xsave {
+ /*
+ * KVM_GET_XSAVE2 and KVM_SET_XSAVE write and read as many bytes
+ * as are returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+ * respectively, when invoked on the vm file descriptor.
+ *
+ * The size value returned by KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)
+ * will always be at least 4096. Currently, it is only greater
+ * than 4096 if a dynamic feature has been enabled with
+ * ``arch_prctl()``, but this may change in the future.
+ *
+ * The offsets of the state save areas in struct kvm_xsave follow
+ * the contents of CPUID leaf 0xD on the host.
+ */
__u32 region[1024];
+ __u32 extra[0];
};
#define KVM_MAX_XCRS 16
@@ -438,6 +452,9 @@ struct kvm_sync_regs {
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
+/* attributes for system fd (group 0) */
+#define KVM_X86_XCOMP_GUEST_SUPP 0
+
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 754a07856817..500b96e71f18 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -2,20 +2,22 @@
#ifndef _ASM_X86_PRCTL_H
#define _ASM_X86_PRCTL_H
-#define ARCH_SET_GS 0x1001
-#define ARCH_SET_FS 0x1002
-#define ARCH_GET_FS 0x1003
-#define ARCH_GET_GS 0x1004
+#define ARCH_SET_GS 0x1001
+#define ARCH_SET_FS 0x1002
+#define ARCH_GET_FS 0x1003
+#define ARCH_GET_GS 0x1004
-#define ARCH_GET_CPUID 0x1011
-#define ARCH_SET_CPUID 0x1012
+#define ARCH_GET_CPUID 0x1011
+#define ARCH_SET_CPUID 0x1012
-#define ARCH_GET_XCOMP_SUPP 0x1021
-#define ARCH_GET_XCOMP_PERM 0x1022
-#define ARCH_REQ_XCOMP_PERM 0x1023
+#define ARCH_GET_XCOMP_SUPP 0x1021
+#define ARCH_GET_XCOMP_PERM 0x1022
+#define ARCH_REQ_XCOMP_PERM 0x1023
+#define ARCH_GET_XCOMP_GUEST_PERM 0x1024
+#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
-#define ARCH_MAP_VDSO_X32 0x2001
-#define ARCH_MAP_VDSO_32 0x2002
-#define ARCH_MAP_VDSO_64 0x2003
+#define ARCH_MAP_VDSO_X32 0x2001
+#define ARCH_MAP_VDSO_32 0x2002
+#define ARCH_MAP_VDSO_64 0x2003
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ff3e600f426..6aef9ee28a39 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -84,7 +84,7 @@ obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
obj-$(CONFIG_INTEL_TXT) += tboot.o
obj-$(CONFIG_ISA_DMA_API) += i8237.o
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += stacktrace.o
obj-y += cpu/
obj-y += acpi/
obj-y += reboot.o
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3f85fcae450c..1e97f944b47d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -139,8 +139,10 @@ static int __init acpi_sleep_setup(char *str)
if (strncmp(str, "s3_beep", 7) == 0)
acpi_realmode_flags |= 4;
#ifdef CONFIG_HIBERNATION
+ if (strncmp(str, "s4_hwsig", 8) == 0)
+ acpi_check_s4_hw_signature(1);
if (strncmp(str, "s4_nohwsig", 10) == 0)
- acpi_no_s4_hw_signature();
+ acpi_check_s4_hw_signature(0);
#endif
if (strncmp(str, "nonvs", 5) == 0)
acpi_nvs_nosave();
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index daf88f8143c5..cf69081073b5 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -60,7 +60,7 @@ save_registers:
popl saved_context_eflags
movl $ret_point, saved_eip
- ret
+ RET
restore_registers:
@@ -70,7 +70,7 @@ restore_registers:
movl saved_context_edi, %edi
pushl saved_context_eflags
popfl
- ret
+ RET
SYM_CODE_START(do_suspend_lowlevel)
call save_processor_state
@@ -86,7 +86,7 @@ SYM_CODE_START(do_suspend_lowlevel)
ret_point:
call restore_registers
call restore_processor_state
- ret
+ RET
SYM_CODE_END(do_suspend_lowlevel)
.data
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 23fb4d51a5da..5007c3ffe96f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -714,7 +714,7 @@ asm (
" .type int3_magic, @function\n"
"int3_magic:\n"
" movl $1, (%" _ASM_ARG1 ")\n"
-" ret\n"
+ ASM_RET
" .size int3_magic, .-int3_magic\n"
" .popsection\n"
);
@@ -1113,10 +1113,13 @@ void text_poke_sync(void)
}
struct text_poke_loc {
- s32 rel_addr; /* addr := _stext + rel_addr */
- s32 rel32;
+ /* addr := _stext + rel_addr */
+ s32 rel_addr;
+ s32 disp;
+ u8 len;
u8 opcode;
const u8 text[POKE_MAX_OPCODE_SIZE];
+ /* see text_poke_bp_batch() */
u8 old;
};
@@ -1131,7 +1134,8 @@ static struct bp_patching_desc *bp_desc;
static __always_inline
struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
{
- struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */
+ /* rcu_dereference */
+ struct bp_patching_desc *desc = __READ_ONCE(*descp);
if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
return NULL;
@@ -1165,7 +1169,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
{
struct bp_patching_desc *desc;
struct text_poke_loc *tp;
- int len, ret = 0;
+ int ret = 0;
void *ip;
if (user_mode(regs))
@@ -1205,8 +1209,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
goto out_put;
}
- len = text_opcode_size(tp->opcode);
- ip += len;
+ ip += tp->len;
switch (tp->opcode) {
case INT3_INSN_OPCODE:
@@ -1221,12 +1224,12 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
break;
case CALL_INSN_OPCODE:
- int3_emulate_call(regs, (long)ip + tp->rel32);
+ int3_emulate_call(regs, (long)ip + tp->disp);
break;
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
- int3_emulate_jmp(regs, (long)ip + tp->rel32);
+ int3_emulate_jmp(regs, (long)ip + tp->disp);
break;
default:
@@ -1301,7 +1304,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
- int len = text_opcode_size(tp[i].opcode);
+ int len = tp[i].len;
if (len - INT3_INSN_SIZE > 0) {
memcpy(old + INT3_INSN_SIZE,
@@ -1378,21 +1381,37 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
const void *opcode, size_t len, const void *emulate)
{
struct insn insn;
- int ret;
+ int ret, i;
memcpy((void *)tp->text, opcode, len);
if (!emulate)
emulate = opcode;
ret = insn_decode_kernel(&insn, emulate);
-
BUG_ON(ret < 0);
- BUG_ON(len != insn.length);
tp->rel_addr = addr - (void *)_stext;
+ tp->len = len;
tp->opcode = insn.opcode.bytes[0];
switch (tp->opcode) {
+ case RET_INSN_OPCODE:
+ case JMP32_INSN_OPCODE:
+ case JMP8_INSN_OPCODE:
+ /*
+ * Control flow instructions without implied execution of the
+ * next instruction can be padded with INT3.
+ */
+ for (i = insn.length; i < len; i++)
+ BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
+ break;
+
+ default:
+ BUG_ON(len != insn.length);
+ };
+
+
+ switch (tp->opcode) {
case INT3_INSN_OPCODE:
case RET_INSN_OPCODE:
break;
@@ -1400,7 +1419,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case CALL_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
- tp->rel32 = insn.immediate.value;
+ tp->disp = insn.immediate.value;
break;
default: /* assume NOP */
@@ -1408,13 +1427,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case 2: /* NOP2 -- emulate as JMP8+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
tp->opcode = JMP8_INSN_OPCODE;
- tp->rel32 = 0;
+ tp->disp = 0;
break;
case 5: /* NOP5 -- emulate as JMP32+0 */
BUG_ON(memcmp(emulate, x86_nops[len], len));
tp->opcode = JMP32_INSN_OPCODE;
- tp->rel32 = 0;
+ tp->disp = 0;
break;
default: /* unknown instruction */
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index c92c9c774c0e..020c906f7934 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -19,17 +19,19 @@
#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
+#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
+#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1
#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
-/* Protect the PCI config register pairs used for SMN and DF indirect access. */
+/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
static u32 *flush_words;
@@ -39,6 +41,7 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
{}
};
@@ -61,6 +64,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
{}
@@ -78,6 +82,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
@@ -182,53 +187,6 @@ int amd_smn_write(u16 node, u32 address, u32 value)
}
EXPORT_SYMBOL_GPL(amd_smn_write);
-/*
- * Data Fabric Indirect Access uses FICAA/FICAD.
- *
- * Fabric Indirect Configuration Access Address (FICAA): Constructed based
- * on the device's Instance Id and the PCI function and register offset of
- * the desired register.
- *
- * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
- * and FICAD HI registers but so far we only need the LO register.
- */
-int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
-{
- struct pci_dev *F4;
- u32 ficaa;
- int err = -ENODEV;
-
- if (node >= amd_northbridges.num)
- goto out;
-
- F4 = node_to_amd_nb(node)->link;
- if (!F4)
- goto out;
-
- ficaa = 1;
- ficaa |= reg & 0x3FC;
- ficaa |= (func & 0x7) << 11;
- ficaa |= instance_id << 16;
-
- mutex_lock(&smn_mutex);
-
- err = pci_write_config_dword(F4, 0x5C, ficaa);
- if (err) {
- pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
- goto out_unlock;
- }
-
- err = pci_read_config_dword(F4, 0x98, lo);
- if (err)
- pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
-
-out_unlock:
- mutex_unlock(&smn_mutex);
-
-out:
- return err;
-}
-EXPORT_SYMBOL_GPL(amd_df_indirect_read);
int amd_cache_northbridges(void)
{
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index dbacb9ec8843..7517eb05bdc1 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -19,6 +19,7 @@
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/irq_remapping.h>
+#include <asm/xen/hypervisor.h>
struct irq_domain *x86_pci_msi_default_domain __ro_after_init;
@@ -159,11 +160,8 @@ static struct irq_chip pci_msi_controller = {
int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
msi_alloc_info_t *arg)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct msi_desc *desc = first_pci_msi_entry(pdev);
-
init_irq_alloc_info(arg, NULL);
- if (desc->msi_attrib.is_msix) {
+ if (to_pci_dev(dev)->msix_enabled) {
arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
} else {
arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
@@ -345,3 +343,8 @@ void dmar_free_hwirq(int irq)
irq_domain_free_irqs(irq, 1);
}
#endif
+
+bool arch_restore_msi_irqs(struct pci_dev *dev)
+{
+ return xen_initdom_restore_msi(dev);
+}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index c132daabe615..3e6f6b448f6a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -760,9 +760,9 @@ void __init lapic_update_legacy_vectors(void)
void __init lapic_assign_system_vectors(void)
{
- unsigned int i, vector = 0;
+ unsigned int i, vector;
- for_each_set_bit_from(vector, system_vectors, NR_VECTORS)
+ for_each_set_bit(vector, system_vectors, NR_VECTORS)
irq_matrix_assign_system(vector_matrix, vector, false);
if (nr_legacy_irqs() > 1)
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index ecd3fd6993d1..9fb0a2f8b62a 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -38,9 +38,6 @@ static void __used common(void)
#endif
BLANK();
- OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
-
- BLANK();
OFFSET(pbe_address, pbe, address);
OFFSET(pbe_orig_address, pbe, orig_address);
OFFSET(pbe_next, pbe, next);
diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c
index 03bb2f343ddb..6a6ffcd978f6 100644
--- a/arch/x86/kernel/cc_platform.c
+++ b/arch/x86/kernel/cc_platform.c
@@ -11,6 +11,7 @@
#include <linux/cc_platform.h>
#include <linux/mem_encrypt.h>
+#include <asm/mshyperv.h>
#include <asm/processor.h>
static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr)
@@ -50,6 +51,14 @@ static bool amd_cc_platform_has(enum cc_attr attr)
case CC_ATTR_GUEST_STATE_ENCRYPT:
return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+ /*
+ * With SEV, the rep string I/O instructions need to be unrolled
+ * but SEV-ES supports them through the #VC handler.
+ */
+ case CC_ATTR_GUEST_UNROLL_STRING_IO:
+ return (sev_status & MSR_AMD64_SEV_ENABLED) &&
+ !(sev_status & MSR_AMD64_SEV_ES_ENABLED);
+
default:
return false;
}
@@ -58,12 +67,19 @@ static bool amd_cc_platform_has(enum cc_attr attr)
#endif
}
+static bool hyperv_cc_platform_has(enum cc_attr attr)
+{
+ return attr == CC_ATTR_GUEST_MEM_ENCRYPT;
+}
bool cc_platform_has(enum cc_attr attr)
{
if (sme_me_mask)
return amd_cc_platform_has(attr);
+ if (hv_is_isolation_supported())
+ return hyperv_cc_platform_has(attr);
+
return false;
}
EXPORT_SYMBOL_GPL(cc_platform_has);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0083464de5e3..7b8382c11788 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -384,7 +384,7 @@ set_register:
}
EXPORT_SYMBOL(native_write_cr0);
-void native_write_cr4(unsigned long val)
+void __no_profile native_write_cr4(unsigned long val)
{
unsigned long bits_changed = 0;
@@ -1787,6 +1787,17 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+static void wrmsrl_cstar(unsigned long val)
+{
+ /*
+ * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR
+ * is so far ignored by the CPU, but raises a #VE trap in a TDX
+ * guest. Avoid the pointless write on all Intel CPUs.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ wrmsrl(MSR_CSTAR, val);
+}
+
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
@@ -1794,7 +1805,7 @@ void syscall_init(void)
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
- wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
+ wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1806,7 +1817,7 @@ void syscall_init(void)
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
- wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
+ wrmsrl_cstar((unsigned long)ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index f4dd73396f28..fbaf12e43f41 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -16,6 +16,7 @@
#include <linux/syscore_ops.h>
#include <linux/pm.h>
+#include <asm/cpu_device_id.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -58,6 +59,22 @@ static DEFINE_PER_CPU(u8, saved_epb);
#define EPB_SAVED 0x10ULL
#define MAX_EPB EPB_MASK
+enum energy_perf_value_index {
+ EPB_INDEX_PERFORMANCE,
+ EPB_INDEX_BALANCE_PERFORMANCE,
+ EPB_INDEX_NORMAL,
+ EPB_INDEX_BALANCE_POWERSAVE,
+ EPB_INDEX_POWERSAVE,
+};
+
+static u8 energ_perf_values[] = {
+ [EPB_INDEX_PERFORMANCE] = ENERGY_PERF_BIAS_PERFORMANCE,
+ [EPB_INDEX_BALANCE_PERFORMANCE] = ENERGY_PERF_BIAS_BALANCE_PERFORMANCE,
+ [EPB_INDEX_NORMAL] = ENERGY_PERF_BIAS_NORMAL,
+ [EPB_INDEX_BALANCE_POWERSAVE] = ENERGY_PERF_BIAS_BALANCE_POWERSAVE,
+ [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE,
+};
+
static int intel_epb_save(void)
{
u64 epb;
@@ -90,7 +107,7 @@ static void intel_epb_restore(void)
*/
val = epb & EPB_MASK;
if (val == ENERGY_PERF_BIAS_PERFORMANCE) {
- val = ENERGY_PERF_BIAS_NORMAL;
+ val = energ_perf_values[EPB_INDEX_NORMAL];
pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
}
}
@@ -103,18 +120,11 @@ static struct syscore_ops intel_epb_syscore_ops = {
};
static const char * const energy_perf_strings[] = {
- "performance",
- "balance-performance",
- "normal",
- "balance-power",
- "power"
-};
-static const u8 energ_perf_values[] = {
- ENERGY_PERF_BIAS_PERFORMANCE,
- ENERGY_PERF_BIAS_BALANCE_PERFORMANCE,
- ENERGY_PERF_BIAS_NORMAL,
- ENERGY_PERF_BIAS_BALANCE_POWERSAVE,
- ENERGY_PERF_BIAS_POWERSAVE
+ [EPB_INDEX_PERFORMANCE] = "performance",
+ [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance",
+ [EPB_INDEX_NORMAL] = "normal",
+ [EPB_INDEX_BALANCE_POWERSAVE] = "balance-power",
+ [EPB_INDEX_POWERSAVE] = "power",
};
static ssize_t energy_perf_bias_show(struct device *dev,
@@ -193,13 +203,22 @@ static int intel_epb_offline(unsigned int cpu)
return 0;
}
+static const struct x86_cpu_id intel_epb_normal[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7),
+ {}
+};
+
static __init int intel_epb_init(void)
{
+ const struct x86_cpu_id *id = x86_match_cpu(intel_epb_normal);
int ret;
if (!boot_cpu_has(X86_FEATURE_EPB))
return -ENODEV;
+ if (id)
+ energ_perf_values[EPB_INDEX_NORMAL] = id->driver_data;
+
ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE,
"x86/intel/epb:online", intel_epb_online,
intel_epb_offline);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index fc85eb17cb6d..9f4b508886dd 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -71,6 +71,22 @@ static const char * const smca_umc_block_names[] = {
"misc_umc"
};
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+};
+
+struct smca_bank {
+ const struct smca_hwid *hwid;
+ u32 id; /* Value of MCA_IPID[InstanceId]. */
+ u8 sysfs_id; /* Value used for sysfs name. */
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
+static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
+
struct smca_bank_name {
const char *name; /* Short name for sysfs */
const char *long_name; /* Long name for pretty-printing */
@@ -95,11 +111,18 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
[SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
[SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
+ [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" },
[SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
[SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
[SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_NBIF] = { "nbif", "NBIF Unit" },
+ [SMCA_SHUB] = { "shub", "System Hub Unit" },
+ [SMCA_SATA] = { "sata", "SATA Unit" },
+ [SMCA_USB] = { "usb", "USB Unit" },
+ [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
[SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
[SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
+ [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" },
};
static const char *smca_get_name(enum smca_bank_types t)
@@ -119,14 +142,14 @@ const char *smca_get_long_name(enum smca_bank_types t)
}
EXPORT_SYMBOL_GPL(smca_get_long_name);
-enum smca_bank_types smca_get_bank_type(unsigned int bank)
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
{
struct smca_bank *b;
if (bank >= MAX_NR_BANKS)
return N_SMCA_BANK_TYPES;
- b = &smca_banks[bank];
+ b = &per_cpu(smca_banks, cpu)[bank];
if (!b->hwid)
return N_SMCA_BANK_TYPES;
@@ -134,7 +157,7 @@ enum smca_bank_types smca_get_bank_type(unsigned int bank)
}
EXPORT_SYMBOL_GPL(smca_get_bank_type);
-static struct smca_hwid smca_hwid_mcatypes[] = {
+static const struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype } */
/* Reserved type */
@@ -174,6 +197,9 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Microprocessor 5 Unit MCA type */
{ SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
+ /* MPDMA MCA type */
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
+
/* Northbridge IO Unit MCA type */
{ SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
@@ -181,19 +207,17 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
{ SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
- /* xGMI PCS MCA type */
{ SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
-
- /* xGMI PHY MCA type */
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
-
- /* WAFL PHY MCA type */
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
};
-struct smca_bank smca_banks[MAX_NR_BANKS];
-EXPORT_SYMBOL_GPL(smca_banks);
-
/*
* In SMCA enabled processors, we can have multiple banks for a given IP type.
* So to define a unique name for each bank, we use a temp c-string to append
@@ -249,8 +273,9 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
static void smca_configure(unsigned int bank, unsigned int cpu)
{
+ u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
+ const struct smca_hwid *s_hwid;
unsigned int i, hwid_mcatype;
- struct smca_hwid *s_hwid;
u32 high, low;
u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
@@ -286,10 +311,6 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
smca_set_misc_banks_map(bank, cpu);
- /* Return early if this bank was already initialized. */
- if (smca_banks[bank].hwid && smca_banks[bank].hwid->hwid_mcatype != 0)
- return;
-
if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
@@ -300,10 +321,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
s_hwid = &smca_hwid_mcatypes[i];
+
if (hwid_mcatype == s_hwid->hwid_mcatype) {
- smca_banks[bank].hwid = s_hwid;
- smca_banks[bank].id = low;
- smca_banks[bank].sysfs_id = s_hwid->count++;
+ this_cpu_ptr(smca_banks)[bank].hwid = s_hwid;
+ this_cpu_ptr(smca_banks)[bank].id = low;
+ this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++;
break;
}
}
@@ -401,7 +423,7 @@ static void threshold_restart_bank(void *_tr)
u32 hi, lo;
/* sysfs write might race against an offline operation */
- if (this_cpu_read(threshold_banks))
+ if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off)
return;
rdmsr(tr->b->address, lo, hi);
@@ -589,7 +611,7 @@ out:
bool amd_filter_mce(struct mce *m)
{
- enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
struct cpuinfo_x86 *c = &boot_cpu_data;
/* See Family 17h Models 10h-2Fh Erratum #1114. */
@@ -627,7 +649,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
} else if (c->x86 == 0x17 &&
(c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
- if (smca_get_bank_type(bank) != SMCA_IF)
+ if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF)
return;
msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
@@ -689,213 +711,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
-int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
-{
- u64 dram_base_addr, dram_limit_addr, dram_hole_base;
- /* We start from the normalized address */
- u64 ret_addr = norm_addr;
-
- u32 tmp;
-
- u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
- u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
- u8 intlv_addr_sel, intlv_addr_bit;
- u8 num_intlv_bits, hashed_bit;
- u8 lgcy_mmio_hole_en, base = 0;
- u8 cs_mask, cs_id = 0;
- bool hash_enabled = false;
-
- /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
- if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
- goto out_err;
-
- /* Remove HiAddrOffset from normalized address, if enabled: */
- if (tmp & BIT(0)) {
- u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
-
- if (norm_addr >= hi_addr_offset) {
- ret_addr -= hi_addr_offset;
- base = 1;
- }
- }
-
- /* Read D18F0x110 (DramBaseAddress). */
- if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
- goto out_err;
-
- /* Check if address range is valid. */
- if (!(tmp & BIT(0))) {
- pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
- __func__, tmp);
- goto out_err;
- }
-
- lgcy_mmio_hole_en = tmp & BIT(1);
- intlv_num_chan = (tmp >> 4) & 0xF;
- intlv_addr_sel = (tmp >> 8) & 0x7;
- dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16;
-
- /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
- if (intlv_addr_sel > 3) {
- pr_err("%s: Invalid interleave address select %d.\n",
- __func__, intlv_addr_sel);
- goto out_err;
- }
-
- /* Read D18F0x114 (DramLimitAddress). */
- if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
- goto out_err;
-
- intlv_num_sockets = (tmp >> 8) & 0x1;
- intlv_num_dies = (tmp >> 10) & 0x3;
- dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
-
- intlv_addr_bit = intlv_addr_sel + 8;
-
- /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
- switch (intlv_num_chan) {
- case 0: intlv_num_chan = 0; break;
- case 1: intlv_num_chan = 1; break;
- case 3: intlv_num_chan = 2; break;
- case 5: intlv_num_chan = 3; break;
- case 7: intlv_num_chan = 4; break;
-
- case 8: intlv_num_chan = 1;
- hash_enabled = true;
- break;
- default:
- pr_err("%s: Invalid number of interleaved channels %d.\n",
- __func__, intlv_num_chan);
- goto out_err;
- }
-
- num_intlv_bits = intlv_num_chan;
-
- if (intlv_num_dies > 2) {
- pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
- __func__, intlv_num_dies);
- goto out_err;
- }
-
- num_intlv_bits += intlv_num_dies;
-
- /* Add a bit if sockets are interleaved. */
- num_intlv_bits += intlv_num_sockets;
-
- /* Assert num_intlv_bits <= 4 */
- if (num_intlv_bits > 4) {
- pr_err("%s: Invalid interleave bits %d.\n",
- __func__, num_intlv_bits);
- goto out_err;
- }
-
- if (num_intlv_bits > 0) {
- u64 temp_addr_x, temp_addr_i, temp_addr_y;
- u8 die_id_bit, sock_id_bit, cs_fabric_id;
-
- /*
- * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
- * This is the fabric id for this coherent slave. Use
- * umc/channel# as instance id of the coherent slave
- * for FICAA.
- */
- if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
- goto out_err;
-
- cs_fabric_id = (tmp >> 8) & 0xFF;
- die_id_bit = 0;
-
- /* If interleaved over more than 1 channel: */
- if (intlv_num_chan) {
- die_id_bit = intlv_num_chan;
- cs_mask = (1 << die_id_bit) - 1;
- cs_id = cs_fabric_id & cs_mask;
- }
-
- sock_id_bit = die_id_bit;
-
- /* Read D18F1x208 (SystemFabricIdMask). */
- if (intlv_num_dies || intlv_num_sockets)
- if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
- goto out_err;
-
- /* If interleaved over more than 1 die. */
- if (intlv_num_dies) {
- sock_id_bit = die_id_bit + intlv_num_dies;
- die_id_shift = (tmp >> 24) & 0xF;
- die_id_mask = (tmp >> 8) & 0xFF;
-
- cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
- }
-
- /* If interleaved over more than 1 socket. */
- if (intlv_num_sockets) {
- socket_id_shift = (tmp >> 28) & 0xF;
- socket_id_mask = (tmp >> 16) & 0xFF;
-
- cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
- }
-
- /*
- * The pre-interleaved address consists of XXXXXXIIIYYYYY
- * where III is the ID for this CS, and XXXXXXYYYYY are the
- * address bits from the post-interleaved address.
- * "num_intlv_bits" has been calculated to tell us how many "I"
- * bits there are. "intlv_addr_bit" tells us how many "Y" bits
- * there are (where "I" starts).
- */
- temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
- temp_addr_i = (cs_id << intlv_addr_bit);
- temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
- ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
- }
-
- /* Add dram base address */
- ret_addr += dram_base_addr;
-
- /* If legacy MMIO hole enabled */
- if (lgcy_mmio_hole_en) {
- if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
- goto out_err;
-
- dram_hole_base = tmp & GENMASK(31, 24);
- if (ret_addr >= dram_hole_base)
- ret_addr += (BIT_ULL(32) - dram_hole_base);
- }
-
- if (hash_enabled) {
- /* Save some parentheses and grab ls-bit at the end. */
- hashed_bit = (ret_addr >> 12) ^
- (ret_addr >> 18) ^
- (ret_addr >> 21) ^
- (ret_addr >> 30) ^
- cs_id;
-
- hashed_bit &= BIT(0);
-
- if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
- ret_addr ^= BIT(intlv_addr_bit);
- }
-
- /* Is calculated system address is above DRAM limit address? */
- if (ret_addr > dram_limit_addr)
- goto out_err;
-
- *sys_addr = ret_addr;
- return 0;
-
-out_err:
- return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
-
bool amd_mce_is_memory_error(struct mce *m)
{
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
if (mce_flags.smca)
- return smca_get_bank_type(m->bank) == SMCA_UMC && xec == 0x0;
+ return smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC && xec == 0x0;
return m->bank == 4 && xec == 0x8;
}
@@ -1211,7 +1033,7 @@ static struct kobj_type threshold_ktype = {
.release = threshold_block_release,
};
-static const char *get_name(unsigned int bank, struct threshold_block *b)
+static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b)
{
enum smca_bank_types bank_type;
@@ -1222,7 +1044,7 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return th_names[bank];
}
- bank_type = smca_get_bank_type(bank);
+ bank_type = smca_get_bank_type(cpu, bank);
if (bank_type >= N_SMCA_BANK_TYPES)
return NULL;
@@ -1232,12 +1054,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return NULL;
}
- if (smca_banks[bank].hwid->count == 1)
+ if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
return smca_get_name(bank_type);
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
- "%s_%x", smca_get_name(bank_type),
- smca_banks[bank].sysfs_id);
+ "%s_%u", smca_get_name(bank_type),
+ per_cpu(smca_banks, cpu)[bank].sysfs_id);
return buf_mcatype;
}
@@ -1293,7 +1115,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb
else
tb->blocks = b;
- err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
if (err)
goto out_free;
recurse:
@@ -1348,7 +1170,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
struct device *dev = this_cpu_read(mce_device);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
- const char *name = get_name(bank, NULL);
+ const char *name = get_name(cpu, bank, NULL);
int err = 0;
if (!dev)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 6ed365337a3b..5818b837fd4d 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -99,7 +99,6 @@ struct mca_config mca_cfg __read_mostly = {
static DEFINE_PER_CPU(struct mce, mces_seen);
static unsigned long mce_need_notify;
-static int cpu_missing;
/*
* MCA banks polled by the period polling timer for corrected events.
@@ -128,7 +127,7 @@ static struct irq_work mce_irq_work;
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
-noinstr void mce_setup(struct mce *m)
+void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
@@ -267,11 +266,17 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
-static void mce_panic(const char *msg, struct mce *final, char *exp)
+static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
{
- int apei_err = 0;
struct llist_node *pending;
struct mce_evt_llist *l;
+ int apei_err = 0;
+
+ /*
+ * Allow instrumentation around external facilities usage. Not that it
+ * matters a whole lot since the machine is going to panic anyway.
+ */
+ instrumentation_begin();
if (!fake_panic) {
/*
@@ -286,7 +291,7 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
} else {
/* Don't log too much for fake panic */
if (atomic_inc_return(&mce_fake_panicked) > 1)
- return;
+ goto out;
}
pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
@@ -314,8 +319,6 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
if (!apei_err)
apei_err = apei_write_mce(final);
}
- if (cpu_missing)
- pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
@@ -324,6 +327,9 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+
+out:
+ instrumentation_end();
}
/* Support code for software error injection */
@@ -365,7 +371,7 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
}
/* MSR access wrappers used for error injection */
-static noinstr u64 mce_rdmsrl(u32 msr)
+noinstr u64 mce_rdmsrl(u32 msr)
{
DECLARE_ARGS(val, low, high);
@@ -433,9 +439,15 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
* check into our "mce" struct so that we can use it later to assess
* the severity of the problem as we read per-bank specific details.
*/
-static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
+ /*
+ * Enable instrumentation around mce_setup() which calls external
+ * facilities.
+ */
+ instrumentation_begin();
mce_setup(m);
+ instrumentation_end();
m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
if (regs) {
@@ -636,7 +648,7 @@ static struct notifier_block mce_default_nb = {
/*
* Read ADDR and MISC registers.
*/
-static void mce_read_aux(struct mce *m, int i)
+static noinstr void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
@@ -871,8 +883,13 @@ static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
/*
* Check if a timeout waiting for other CPUs happened.
*/
-static int mce_timed_out(u64 *t, const char *msg)
+static noinstr int mce_timed_out(u64 *t, const char *msg)
{
+ int ret = 0;
+
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
/*
* The others already did panic for some reason.
* Bail out like in a timeout.
@@ -891,13 +908,17 @@ static int mce_timed_out(u64 *t, const char *msg)
cpumask_pr_args(&mce_missing_cpus));
mce_panic(msg, NULL, NULL);
}
- cpu_missing = 1;
- return 1;
+ ret = 1;
+ goto out;
}
*t -= SPINUNIT;
+
out:
touch_nmi_watchdog();
- return 0;
+
+ instrumentation_end();
+
+ return ret;
}
/*
@@ -986,14 +1007,13 @@ static atomic_t global_nwo;
* in the entry order.
* TBD double check parallel CPU hotunplug
*/
-static int mce_start(int *no_way_out)
+static noinstr int mce_start(int *no_way_out)
{
- int order;
- int cpus = num_online_cpus();
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int order, ret = -1;
if (!timeout)
- return -1;
+ return ret;
atomic_add(*no_way_out, &global_nwo);
/*
@@ -1003,14 +1023,17 @@ static int mce_start(int *no_way_out)
order = atomic_inc_return(&mce_callin);
cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
/*
* Wait for everyone.
*/
- while (atomic_read(&mce_callin) != cpus) {
+ while (atomic_read(&mce_callin) != num_online_cpus()) {
if (mce_timed_out(&timeout,
"Timeout: Not all CPUs entered broadcast exception handler")) {
atomic_set(&global_nwo, 0);
- return -1;
+ goto out;
}
ndelay(SPINUNIT);
}
@@ -1036,7 +1059,7 @@ static int mce_start(int *no_way_out)
if (mce_timed_out(&timeout,
"Timeout: Subject CPUs unable to finish machine check processing")) {
atomic_set(&global_nwo, 0);
- return -1;
+ goto out;
}
ndelay(SPINUNIT);
}
@@ -1047,17 +1070,25 @@ static int mce_start(int *no_way_out)
*/
*no_way_out = atomic_read(&global_nwo);
- return order;
+ ret = order;
+
+out:
+ instrumentation_end();
+
+ return ret;
}
/*
* Synchronize between CPUs after main scanning loop.
* This invokes the bulk of the Monarch processing.
*/
-static int mce_end(int order)
+static noinstr int mce_end(int order)
{
- int ret = -1;
u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int ret = -1;
+
+ /* Allow instrumentation around external facilities. */
+ instrumentation_begin();
if (!timeout)
goto reset;
@@ -1070,14 +1101,11 @@ static int mce_end(int order)
atomic_inc(&mce_executing);
if (order == 1) {
- /* CHECKME: Can this race with a parallel hotplug? */
- int cpus = num_online_cpus();
-
/*
* Monarch: Wait for everyone to go through their scanning
* loops.
*/
- while (atomic_read(&mce_executing) <= cpus) {
+ while (atomic_read(&mce_executing) <= num_online_cpus()) {
if (mce_timed_out(&timeout,
"Timeout: Monarch CPU unable to finish machine check processing"))
goto reset;
@@ -1101,7 +1129,8 @@ static int mce_end(int order)
/*
* Don't reset anything. That's done by the Monarch.
*/
- return 0;
+ ret = 0;
+ goto out;
}
/*
@@ -1117,6 +1146,10 @@ reset:
* Let others run again.
*/
atomic_set(&mce_executing, 0);
+
+out:
+ instrumentation_end();
+
return ret;
}
@@ -1165,13 +1198,14 @@ static noinstr bool mce_check_crashing_cpu(void)
return false;
}
-static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
- unsigned long *toclear, unsigned long *valid_banks,
- int no_way_out, int *worst)
+static __always_inline int
+__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
+ unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
+ int *worst)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
- int severity, i;
+ int severity, i, taint = 0;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
__clear_bit(i, toclear);
@@ -1198,7 +1232,7 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
continue;
/* Set taint even when machine check was not enabled. */
- add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+ taint++;
severity = mce_severity(m, regs, cfg->tolerant, NULL, true);
@@ -1221,7 +1255,13 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
/* assuming valid severity level != 0 */
m->severity = severity;
+ /*
+ * Enable instrumentation around the mce_log() call which is
+ * done in #MC context, where instrumentation is disabled.
+ */
+ instrumentation_begin();
mce_log(m);
+ instrumentation_end();
if (severity > *worst) {
*final = *m;
@@ -1231,6 +1271,8 @@ static void __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *fin
/* mce_clear_state will clear *final, save locally for use later */
*m = *final;
+
+ return taint;
}
static void kill_me_now(struct callback_head *ch)
@@ -1320,11 +1362,11 @@ static noinstr void unexpected_machine_check(struct pt_regs *regs)
}
/*
- * The actual machine check handler. This only handles real
- * exceptions when something got corrupted coming in through int 18.
+ * The actual machine check handler. This only handles real exceptions when
+ * something got corrupted coming in through int 18.
*
- * This is executed in NMI context not subject to normal locking rules. This
- * implies that most kernel services cannot be safely used. Don't even
+ * This is executed in #MC context not subject to normal locking rules.
+ * This implies that most kernel services cannot be safely used. Don't even
* think about putting a printk in there!
*
* On Intel systems this is entered on all CPUs in parallel through
@@ -1336,12 +1378,20 @@ static noinstr void unexpected_machine_check(struct pt_regs *regs)
* issues: if the machine check was due to a failure of the memory
* backing the user stack, tracing that reads the user stack will cause
* potentially infinite recursion.
+ *
+ * Currently, the #MC handler calls out to a number of external facilities
+ * and, therefore, allows instrumentation around them. The optimal thing to
+ * have would be to do the absolutely minimal work required in #MC context
+ * and have instrumentation disabled only around that. Further processing can
+ * then happen in process context where instrumentation is allowed. Achieving
+ * that requires careful auditing and modifications. Until then, the code
+ * allows instrumentation temporarily, where required. *
*/
noinstr void do_machine_check(struct pt_regs *regs)
{
- int worst = 0, order, no_way_out, kill_current_task, lmce;
- DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
char *msg = NULL;
@@ -1385,7 +1435,6 @@ noinstr void do_machine_check(struct pt_regs *regs)
final = this_cpu_ptr(&mces_seen);
*final = m;
- memset(valid_banks, 0, sizeof(valid_banks));
no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
barrier();
@@ -1419,7 +1468,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
order = mce_start(&no_way_out);
}
- __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
+ taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
if (!no_way_out)
mce_clear_state(toclear);
@@ -1451,6 +1500,16 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
}
+ /*
+ * Enable instrumentation around the external facilities like task_work_add()
+ * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
+ * properly would need a lot more involved reorganization.
+ */
+ instrumentation_begin();
+
+ if (taint)
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
if (worst != MCE_AR_SEVERITY && !kill_current_task)
goto out;
@@ -1482,7 +1541,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
if (m.kflags & MCE_IN_KERNEL_COPYIN)
queue_task_work(&m, msg, kill_me_never);
}
+
out:
+ instrumentation_end();
+
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -2702,7 +2764,6 @@ struct dentry *mce_get_debugfs_dir(void)
static void mce_reset(void)
{
- cpu_missing = 0;
atomic_set(&mce_fake_panicked, 0);
atomic_set(&mce_executing, 0);
atomic_set(&mce_callin, 0);
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 0bfc14041bbb..5fbd7ffb3233 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -74,7 +74,6 @@ MCE_INJECT_SET(status);
MCE_INJECT_SET(misc);
MCE_INJECT_SET(addr);
MCE_INJECT_SET(synd);
-MCE_INJECT_SET(ipid);
#define MCE_INJECT_GET(reg) \
static int inj_##reg##_get(void *data, u64 *val) \
@@ -95,6 +94,20 @@ DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+
+/* Use the user provided IPID value on a sw injection. */
+static int inj_ipid_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ if (inj_type == SW_INJ)
+ m->ipid = val;
+ }
+
+ return 0;
+}
+
DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n");
static void setup_inj_struct(struct mce *m)
@@ -350,7 +363,7 @@ static ssize_t flags_write(struct file *filp, const char __user *ubuf,
char buf[MAX_FLAG_OPT_SIZE], *__buf;
int err;
- if (cnt > MAX_FLAG_OPT_SIZE)
+ if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
@@ -490,6 +503,8 @@ static void do_inject(void)
i_mce.tsc = rdtsc_ordered();
+ i_mce.status |= MCI_STATUS_VAL;
+
if (i_mce.misc)
i_mce.status |= MCI_STATUS_MISCV;
@@ -577,6 +592,33 @@ static int inj_bank_set(void *data, u64 val)
}
m->bank = val;
+
+ /*
+ * sw-only injection allows to write arbitrary values into the MCA
+ * registers because it tests only the decoding paths.
+ */
+ if (inj_type == SW_INJ)
+ goto inject;
+
+ /*
+ * Read IPID value to determine if a bank is populated on the target
+ * CPU.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ u64 ipid;
+
+ if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
+ pr_err("Error reading IPID on CPU%d\n", m->extcpu);
+ return -EINVAL;
+ }
+
+ if (!ipid) {
+ pr_err("Cannot inject into unpopulated bank %llu\n", val);
+ return -ENODEV;
+ }
+ }
+
+inject:
do_inject();
/* Reset injection struct */
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index bb9a46a804bf..baafbb37be67 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -486,6 +486,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_ICELAKE_X:
+ case INTEL_FAM6_ICELAKE_D:
case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index acd61c41846c..52c633950b38 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -207,4 +207,6 @@ static inline void pentium_machine_check(struct pt_regs *regs) {}
static inline void winchip_machine_check(struct pt_regs *regs) {}
#endif
+noinstr u64 mce_rdmsrl(u32 msr);
+
#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index bb019a594a2c..7aa2bda93cbb 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -222,6 +222,9 @@ static bool is_copy_from_user(struct pt_regs *regs)
struct insn insn;
int ret;
+ if (!regs)
+ return false;
+
if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
return false;
@@ -263,24 +266,36 @@ static bool is_copy_from_user(struct pt_regs *regs)
* distinguish an exception taken in user from from one
* taken in the kernel.
*/
-static int error_context(struct mce *m, struct pt_regs *regs)
+static noinstr int error_context(struct mce *m, struct pt_regs *regs)
{
+ int fixup_type;
+ bool copy_user;
+
if ((m->cs & 3) == 3)
return IN_USER;
+
if (!mc_recoverable(m->mcgstatus))
return IN_KERNEL;
- switch (ex_get_fixup_type(m->ip)) {
+ /* Allow instrumentation around external facilities usage. */
+ instrumentation_begin();
+ fixup_type = ex_get_fixup_type(m->ip);
+ copy_user = is_copy_from_user(regs);
+ instrumentation_end();
+
+ switch (fixup_type) {
case EX_TYPE_UACCESS:
case EX_TYPE_COPY:
- if (!regs || !is_copy_from_user(regs))
+ if (!copy_user)
return IN_KERNEL;
m->kflags |= MCE_IN_KERNEL_COPYIN;
fallthrough;
+
case EX_TYPE_FAULT_MCE_SAFE:
case EX_TYPE_DEFAULT_MCE_SAFE:
m->kflags |= MCE_IN_KERNEL_RECOV;
return IN_KERNEL_RECOV;
+
default:
return IN_KERNEL;
}
@@ -288,8 +303,7 @@ static int error_context(struct mce *m, struct pt_regs *regs)
static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
{
- u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
- u32 low, high;
+ u64 mcx_cfg;
/*
* We need to look at the following bits:
@@ -300,11 +314,10 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
if (!mce_flags.succor)
return MCE_PANIC_SEVERITY;
- if (rdmsr_safe(addr, &low, &high))
- return MCE_PANIC_SEVERITY;
+ mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank));
/* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
- if ((low & MCI_CONFIG_MCAX) &&
+ if ((mcx_cfg & MCI_CONFIG_MCAX) &&
(m->status & MCI_STATUS_TCC) &&
(err_ctx == IN_KERNEL))
return MCE_PANIC_SEVERITY;
@@ -317,8 +330,8 @@ static int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
*/
-static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
- char **msg, bool is_excp)
+static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
+ char **msg, bool is_excp)
{
enum context ctx = error_context(m, regs);
@@ -370,8 +383,8 @@ static int mce_severity_amd(struct mce *m, struct pt_regs *regs, int tolerant,
return MCE_KEEP_SEVERITY;
}
-static int mce_severity_intel(struct mce *m, struct pt_regs *regs,
- int tolerant, char **msg, bool is_excp)
+static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs,
+ int tolerant, char **msg, bool is_excp)
{
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m, regs);
@@ -407,8 +420,8 @@ static int mce_severity_intel(struct mce *m, struct pt_regs *regs,
}
}
-int mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg,
- bool is_excp)
+int noinstr mce_severity(struct mce *m, struct pt_regs *regs, int tolerant, char **msg,
+ bool is_excp)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index ff55df60228f..5a99f993e639 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
#include <linux/kexec.h>
#include <linux/i8253.h>
#include <linux/random.h>
+#include <linux/swiotlb.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
@@ -79,7 +80,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
- add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
+ add_interrupt_randomness(HYPERV_STIMER0_VECTOR);
ack_APIC_irq();
set_irq_regs(old_regs);
@@ -329,8 +330,20 @@ static void __init ms_hyperv_init_platform(void)
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
- if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
static_branch_enable(&isolation_type_snp);
+#ifdef CONFIG_SWIOTLB
+ swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
+#endif
+ }
+
+#ifdef CONFIG_SWIOTLB
+ /*
+ * Enable swiotlb force mode in Isolation VM to
+ * use swiotlb bounce buffer for dma transaction.
+ */
+ swiotlb_force = SWIOTLB_FORCE;
+#endif
}
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c9f0f3d63f75..eaf25a234ff5 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -282,7 +282,7 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
u64 shift = 64 - width, chunks;
chunks = (cur_msr << shift) - (prev_msr << shift);
- return chunks >>= shift;
+ return chunks >> shift;
}
static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 9b204843b78d..fa04a73daf9c 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -11,26 +11,8 @@
#include <asm/traps.h>
#include "sgx.h"
-/**
- * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
- *
- * ENCLS has its own (positive value) error codes and also generates
- * ENCLS specific #GP and #PF faults. And the ENCLS values get munged
- * with system error codes as everything percolates back up the stack.
- * Unfortunately (for us), we need to precisely identify each unique
- * error code, e.g. the action taken if EWB fails varies based on the
- * type of fault and on the exact SGX error code, i.e. we can't simply
- * convert all faults to -EFAULT.
- *
- * To make all three error types coexist, we set bit 30 to identify an
- * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate
- * between positive (faults and SGX error codes) and negative (system
- * error codes) values.
- */
-#define ENCLS_FAULT_FLAG 0x40000000
-
/* Retrieve the encoded trapnr from the specified return code. */
-#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG)
+#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG)
/* Issue a WARN() about an ENCLS function. */
#define ENCLS_WARN(r, name) { \
@@ -50,7 +32,7 @@
*/
static inline bool encls_faulted(int ret)
{
- return ret & ENCLS_FAULT_FLAG;
+ return ret & SGX_ENCLS_FAULT_FLAG;
}
/**
@@ -88,11 +70,7 @@ static inline bool encls_failed(int ret)
asm volatile( \
"1: .byte 0x0f, 0x01, 0xcf;\n\t" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_FAULT(1b, 3b) \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
: "=a"(ret) \
: "a"(rax), inputs \
: "memory", "cc"); \
@@ -127,7 +105,7 @@ static inline bool encls_failed(int ret)
*
* Return:
* 0 on success,
- * trapnr with ENCLS_FAULT_FLAG set on fault
+ * trapnr with SGX_ENCLS_FAULT_FLAG set on fault
*/
#define __encls_N(rax, rbx_out, inputs...) \
({ \
@@ -136,11 +114,7 @@ static inline bool encls_failed(int ret)
"1: .byte 0x0f, 0x01, 0xcf;\n\t" \
" xor %%eax,%%eax;\n" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_FAULT(1b, 3b) \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
: "=a"(ret), "=b"(rbx_out) \
: "a"(rax), inputs \
: "memory"); \
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 8471a8b9b48e..4b41efc9e367 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -6,11 +6,13 @@
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/miscdevice.h>
+#include <linux/node.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
@@ -20,6 +22,7 @@ struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
static int sgx_nr_epc_sections;
static struct task_struct *ksgxd_tsk;
static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+static DEFINE_XARRAY(sgx_epc_address_space);
/*
* These variables are part of the state of the reclaimer, and must be accessed
@@ -60,6 +63,24 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
+ /*
+ * Checking page->poison without holding the node->lock
+ * is racy, but losing the race (i.e. poison is set just
+ * after the check) just means __eremove() will be uselessly
+ * called for a page that sgx_free_epc_page() will put onto
+ * the node->sgx_poison_page_list later.
+ */
+ if (page->poison) {
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+ list_move(&page->list, &node->sgx_poison_page_list);
+ spin_unlock(&node->lock);
+
+ continue;
+ }
+
ret = __eremove(sgx_get_epc_virt_addr(page));
if (!ret) {
/*
@@ -471,6 +492,7 @@ static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
+ page->flags = 0;
spin_unlock(&node->lock);
atomic_long_dec(&sgx_nr_free_pages);
@@ -624,7 +646,12 @@ void sgx_free_epc_page(struct sgx_epc_page *page)
spin_lock(&node->lock);
- list_add_tail(&page->list, &node->free_page_list);
+ page->owner = NULL;
+ if (page->poison)
+ list_add(&page->list, &node->sgx_poison_page_list);
+ else
+ list_add_tail(&page->list, &node->free_page_list);
+ page->flags = SGX_EPC_PAGE_IS_FREE;
spin_unlock(&node->lock);
atomic_long_inc(&sgx_nr_free_pages);
@@ -648,17 +675,102 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
}
section->phys_addr = phys_addr;
+ xa_store_range(&sgx_epc_address_space, section->phys_addr,
+ phys_addr + size - 1, section, GFP_KERNEL);
for (i = 0; i < nr_pages; i++) {
section->pages[i].section = index;
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
+ section->pages[i].poison = 0;
list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
return true;
}
+bool arch_is_platform_page(u64 paddr)
+{
+ return !!xa_load(&sgx_epc_address_space, paddr);
+}
+EXPORT_SYMBOL_GPL(arch_is_platform_page);
+
+static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
+{
+ struct sgx_epc_section *section;
+
+ section = xa_load(&sgx_epc_address_space, paddr);
+ if (!section)
+ return NULL;
+
+ return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
+}
+
+/*
+ * Called in process context to handle a hardware reported
+ * error in an SGX EPC page.
+ * If the MF_ACTION_REQUIRED bit is set in flags, then the
+ * context is the task that consumed the poison data. Otherwise
+ * this is called from a kernel thread unrelated to the page.
+ */
+int arch_memory_failure(unsigned long pfn, int flags)
+{
+ struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
+ struct sgx_epc_section *section;
+ struct sgx_numa_node *node;
+
+ /*
+ * mm/memory-failure.c calls this routine for all errors
+ * where there isn't a "struct page" for the address. But that
+ * includes other address ranges besides SGX.
+ */
+ if (!page)
+ return -ENXIO;
+
+ /*
+ * If poison was consumed synchronously. Send a SIGBUS to
+ * the task. Hardware has already exited the SGX enclave and
+ * will not allow re-entry to an enclave that has a memory
+ * error. The signal may help the task understand why the
+ * enclave is broken.
+ */
+ if (flags & MF_ACTION_REQUIRED)
+ force_sig(SIGBUS);
+
+ section = &sgx_epc_sections[page->section];
+ node = section->node;
+
+ spin_lock(&node->lock);
+
+ /* Already poisoned? Nothing more to do */
+ if (page->poison)
+ goto out;
+
+ page->poison = 1;
+
+ /*
+ * If the page is on a free list, move it to the per-node
+ * poison page list.
+ */
+ if (page->flags & SGX_EPC_PAGE_IS_FREE) {
+ list_move(&page->list, &node->sgx_poison_page_list);
+ goto out;
+ }
+
+ /*
+ * TBD: Add additional plumbing to enable pre-emptive
+ * action for asynchronous poison notification. Until
+ * then just hope that the poison:
+ * a) is not accessed - sgx_free_epc_page() will deal with it
+ * when the user gives it back
+ * b) results in a recoverable machine check rather than
+ * a fatal one
+ */
+out:
+ spin_unlock(&node->lock);
+ return 0;
+}
+
/**
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
@@ -670,6 +782,48 @@ static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
((high & GENMASK_ULL(19, 0)) << 32);
}
+#ifdef CONFIG_NUMA
+static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
+}
+static DEVICE_ATTR_RO(sgx_total_bytes);
+
+static umode_t arch_node_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int idx)
+{
+ /* Make all x86/ attributes invisible when SGX is not initialized: */
+ if (nodes_empty(sgx_numa_mask))
+ return 0;
+
+ return attr->mode;
+}
+
+static struct attribute *arch_node_dev_attrs[] = {
+ &dev_attr_sgx_total_bytes.attr,
+ NULL,
+};
+
+const struct attribute_group arch_node_dev_group = {
+ .name = "x86",
+ .attrs = arch_node_dev_attrs,
+ .is_visible = arch_node_attr_is_visible,
+};
+
+static void __init arch_update_sysfs_visibility(int nid)
+{
+ struct node *node = node_devices[nid];
+ int ret;
+
+ ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
+
+ if (ret)
+ pr_err("sysfs update failed (%d), files may be invisible", ret);
+}
+#else /* !CONFIG_NUMA */
+static void __init arch_update_sysfs_visibility(int nid) {}
+#endif
+
static bool __init sgx_page_cache_init(void)
{
u32 eax, ebx, ecx, edx, type;
@@ -713,10 +867,16 @@ static bool __init sgx_page_cache_init(void)
if (!node_isset(nid, sgx_numa_mask)) {
spin_lock_init(&sgx_numa_nodes[nid].lock);
INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
node_set(nid, sgx_numa_mask);
+ sgx_numa_nodes[nid].size = 0;
+
+ /* Make SGX-specific node sysfs files visible: */
+ arch_update_sysfs_visibility(nid);
}
sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+ sgx_numa_nodes[nid].size += size;
sgx_nr_epc_sections++;
}
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 4628acec0009..0f17def9fe6f 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -26,9 +26,13 @@
/* Pages, which are being tracked by the page reclaimer. */
#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0)
+/* Pages on free list */
+#define SGX_EPC_PAGE_IS_FREE BIT(1)
+
struct sgx_epc_page {
unsigned int section;
- unsigned int flags;
+ u16 flags;
+ u16 poison;
struct sgx_encl_page *owner;
struct list_head list;
};
@@ -39,6 +43,8 @@ struct sgx_epc_page {
*/
struct sgx_numa_node {
struct list_head free_page_list;
+ struct list_head sgx_poison_page_list;
+ unsigned long size;
spinlock_t lock;
};
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ea4fe192189d..53de044e5654 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -351,7 +351,7 @@ unsigned long oops_begin(void)
}
NOKPROBE_SYMBOL(oops_begin);
-void __noreturn rewind_stack_do_exit(int signr);
+void __noreturn rewind_stack_and_make_dead(int signr);
void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
@@ -386,7 +386,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
* reuse the task stack and that existing poisons are invalid.
*/
kasan_unpoison_task_stack(current);
- rewind_stack_do_exit(signr);
+ rewind_stack_and_make_dead(signr);
}
NOKPROBE_SYMBOL(oops_end);
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 1ca3a56fdc2d..bd6dad83c65b 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -515,6 +515,7 @@ static const struct intel_early_ops gen11_early_ops __initconst = {
.stolen_size = gen9_stolen_size,
};
+/* Intel integrated GPUs for which we need to reserve "stolen memory" */
static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_I830_IDS(&i830_early_ops),
INTEL_I845G_IDS(&i845_early_ops),
@@ -593,6 +594,13 @@ static void __init intel_graphics_quirks(int num, int slot, int func)
u16 device;
int i;
+ /*
+ * Reserve "stolen memory" for an integrated GPU. If we've already
+ * found one, there's nothing to do for other (discrete) GPUs.
+ */
+ if (resource_size(&intel_graphics_stolen_res))
+ return;
+
device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) {
@@ -705,7 +713,7 @@ static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
- QFLAG_APPLY_ONCE, intel_graphics_quirks },
+ 0, intel_graphics_quirks },
/*
* HPET on the current version of the Baytrail platform has accuracy
* problems: it will halt in deep idle state - so we disable it.
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 8ea306b1bf8e..8dea01ffc5c1 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -99,6 +99,19 @@ bool irq_fpu_usable(void)
EXPORT_SYMBOL(irq_fpu_usable);
/*
+ * Track AVX512 state use because it is known to slow the max clock
+ * speed of the core.
+ */
+static void update_avx_timestamp(struct fpu *fpu)
+{
+
+#define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)
+
+ if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK)
+ fpu->avx512_timestamp = jiffies;
+}
+
+/*
* Save the FPU register state in fpu->fpstate->regs. The register state is
* preserved.
*
@@ -116,13 +129,7 @@ void save_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
os_xsave(fpu->fpstate);
-
- /*
- * AVX512 state is tracked here because its use is
- * known to slow the max clock speed of the core.
- */
- if (fpu->fpstate->regs.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
- fpu->avx512_timestamp = jiffies;
+ update_avx_timestamp(fpu);
return;
}
@@ -199,7 +206,27 @@ void fpu_reset_from_exception_fixup(void)
}
#if IS_ENABLED(CONFIG_KVM)
-static void __fpstate_reset(struct fpstate *fpstate);
+static void __fpstate_reset(struct fpstate *fpstate, u64 xfd);
+
+static void fpu_init_guest_permissions(struct fpu_guest *gfpu)
+{
+ struct fpu_state_perm *fpuperm;
+ u64 perm;
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ spin_lock_irq(&current->sighand->siglock);
+ fpuperm = &current->group_leader->thread.fpu.guest_perm;
+ perm = fpuperm->__state_perm;
+
+ /* First fpstate allocation locks down permissions. */
+ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED);
+
+ spin_unlock_irq(&current->sighand->siglock);
+
+ gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED;
+}
bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
{
@@ -211,12 +238,18 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu)
if (!fpstate)
return false;
- __fpstate_reset(fpstate);
+ /* Leave xfd to 0 (the reset value defined by spec) */
+ __fpstate_reset(fpstate, 0);
fpstate_init_user(fpstate);
fpstate->is_valloc = true;
fpstate->is_guest = true;
- gfpu->fpstate = fpstate;
+ gfpu->fpstate = fpstate;
+ gfpu->xfeatures = fpu_user_cfg.default_features;
+ gfpu->perm = fpu_user_cfg.default_features;
+ gfpu->uabi_size = fpu_user_cfg.default_size;
+ fpu_init_guest_permissions(gfpu);
+
return true;
}
EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate);
@@ -236,6 +269,64 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu)
}
EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate);
+/*
+ * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable
+ * @guest_fpu: Pointer to the guest FPU container
+ * @xfeatures: Features requested by guest CPUID
+ *
+ * Enable all dynamic xfeatures according to guest perm and requested CPUID.
+ *
+ * Return: 0 on success, error code otherwise
+ */
+int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures)
+{
+ lockdep_assert_preemption_enabled();
+
+ /* Nothing to do if all requested features are already enabled. */
+ xfeatures &= ~guest_fpu->xfeatures;
+ if (!xfeatures)
+ return 0;
+
+ return __xfd_enable_feature(xfeatures, guest_fpu);
+}
+EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features);
+
+#ifdef CONFIG_X86_64
+void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
+{
+ fpregs_lock();
+ guest_fpu->fpstate->xfd = xfd;
+ if (guest_fpu->fpstate->in_use)
+ xfd_update_state(guest_fpu->fpstate);
+ fpregs_unlock();
+}
+EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
+
+/**
+ * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state
+ *
+ * Must be invoked from KVM after a VMEXIT before enabling interrupts when
+ * XFD write emulation is disabled. This is required because the guest can
+ * freely modify XFD and the state at VMEXIT is not guaranteed to be the
+ * same as the state on VMENTER. So software state has to be udpated before
+ * any operation which depends on it can take place.
+ *
+ * Note: It can be invoked unconditionally even when write emulation is
+ * enabled for the price of a then pointless MSR read.
+ */
+void fpu_sync_guest_vmexit_xfd_state(void)
+{
+ struct fpstate *fps = current->thread.fpu.fpstate;
+
+ lockdep_assert_irqs_disabled();
+ if (fpu_state_size_dynamic()) {
+ rdmsrl(MSR_IA32_XFD, fps->xfd);
+ __this_cpu_write(xfd_state, fps->xfd);
+ }
+}
+EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state);
+#endif /* CONFIG_X86_64 */
+
int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
{
struct fpstate *guest_fps = guest_fpu->fpstate;
@@ -430,26 +521,28 @@ void fpstate_init_user(struct fpstate *fpstate)
fpstate_init_fstate(fpstate);
}
-static void __fpstate_reset(struct fpstate *fpstate)
+static void __fpstate_reset(struct fpstate *fpstate, u64 xfd)
{
/* Initialize sizes and feature masks */
fpstate->size = fpu_kernel_cfg.default_size;
fpstate->user_size = fpu_user_cfg.default_size;
fpstate->xfeatures = fpu_kernel_cfg.default_features;
fpstate->user_xfeatures = fpu_user_cfg.default_features;
- fpstate->xfd = init_fpstate.xfd;
+ fpstate->xfd = xfd;
}
void fpstate_reset(struct fpu *fpu)
{
/* Set the fpstate pointer to the default fpstate */
fpu->fpstate = &fpu->__fpstate;
- __fpstate_reset(fpu->fpstate);
+ __fpstate_reset(fpu->fpstate, init_fpstate.xfd);
/* Initialize the permission related info in fpu */
fpu->perm.__state_perm = fpu_kernel_cfg.default_features;
fpu->perm.__state_size = fpu_kernel_cfg.default_size;
fpu->perm.__user_state_size = fpu_user_cfg.default_size;
+ /* Same defaults for guests */
+ fpu->guest_perm = fpu->perm;
}
static inline void fpu_inherit_perms(struct fpu *dst_fpu)
@@ -460,6 +553,7 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu)
spin_lock_irq(&current->sighand->siglock);
/* Fork also inherits the permissions of the parent */
dst_fpu->perm = src_fpu->perm;
+ dst_fpu->guest_perm = src_fpu->guest_perm;
spin_unlock_irq(&current->sighand->siglock);
}
}
diff --git a/arch/x86/kernel/fpu/legacy.h b/arch/x86/kernel/fpu/legacy.h
index 17c26b164c63..098f367bb8a7 100644
--- a/arch/x86/kernel/fpu/legacy.h
+++ b/arch/x86/kernel/fpu/legacy.h
@@ -35,11 +35,7 @@ static inline void ldmxcsr(u32 mxcsr)
int err; \
asm volatile("1:" #insn "\n\t" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: movl $-1,%[err]\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \
: [err] "=r" (err), output \
: "0"(0), input); \
err; \
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index d5958278eba6..91d4b6de58ab 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -118,7 +118,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
struct fpstate *fpstate)
{
struct xregs_state __user *x = buf;
- struct _fpx_sw_bytes sw_bytes;
+ struct _fpx_sw_bytes sw_bytes = {};
u32 xfeatures;
int err;
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d28829403ed0..02b3ddaf4f75 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1500,35 +1500,13 @@ void fpstate_free(struct fpu *fpu)
}
/**
- * fpu_install_fpstate - Update the active fpstate in the FPU
- *
- * @fpu: A struct fpu * pointer
- * @newfps: A struct fpstate * pointer
- *
- * Returns: A null pointer if the last active fpstate is the embedded
- * one or the new fpstate is already installed;
- * otherwise, a pointer to the old fpstate which has to
- * be freed by the caller.
- */
-static struct fpstate *fpu_install_fpstate(struct fpu *fpu,
- struct fpstate *newfps)
-{
- struct fpstate *oldfps = fpu->fpstate;
-
- if (fpu->fpstate == newfps)
- return NULL;
-
- fpu->fpstate = newfps;
- return oldfps != &fpu->__fpstate ? oldfps : NULL;
-}
-
-/**
* fpstate_realloc - Reallocate struct fpstate for the requested new features
*
* @xfeatures: A bitmap of xstate features which extend the enabled features
* of that task
* @ksize: The required size for the kernel buffer
* @usize: The required size for user space buffers
+ * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations
*
* Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
* terminates quickly, vfree()-induced IPIs may be a concern, but tasks
@@ -1537,13 +1515,13 @@ static struct fpstate *fpu_install_fpstate(struct fpu *fpu,
* Returns: 0 on success, -ENOMEM on allocation error.
*/
static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
- unsigned int usize)
+ unsigned int usize, struct fpu_guest *guest_fpu)
{
struct fpu *fpu = &current->thread.fpu;
struct fpstate *curfps, *newfps = NULL;
unsigned int fpsize;
+ bool in_use;
- curfps = fpu->fpstate;
fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);
newfps = vzalloc(fpsize);
@@ -1553,28 +1531,56 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
newfps->user_size = usize;
newfps->is_valloc = true;
+ /*
+ * When a guest FPU is supplied, use @guest_fpu->fpstate
+ * as reference independent whether it is in use or not.
+ */
+ curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;
+
+ /* Determine whether @curfps is the active fpstate */
+ in_use = fpu->fpstate == curfps;
+
+ if (guest_fpu) {
+ newfps->is_guest = true;
+ newfps->is_confidential = curfps->is_confidential;
+ newfps->in_use = curfps->in_use;
+ guest_fpu->xfeatures |= xfeatures;
+ guest_fpu->uabi_size = usize;
+ }
+
fpregs_lock();
/*
- * Ensure that the current state is in the registers before
- * swapping fpstate as that might invalidate it due to layout
- * changes.
+ * If @curfps is in use, ensure that the current state is in the
+ * registers before swapping fpstate as that might invalidate it
+ * due to layout changes.
*/
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
fpregs_restore_userregs();
newfps->xfeatures = curfps->xfeatures | xfeatures;
newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
newfps->xfd = curfps->xfd & ~xfeatures;
- curfps = fpu_install_fpstate(fpu, newfps);
-
/* Do the final updates within the locked region */
xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);
- xfd_update_state(newfps);
+ if (guest_fpu) {
+ guest_fpu->fpstate = newfps;
+ /* If curfps is active, update the FPU fpstate pointer */
+ if (in_use)
+ fpu->fpstate = newfps;
+ } else {
+ fpu->fpstate = newfps;
+ }
+
+ if (in_use)
+ xfd_update_state(fpu->fpstate);
fpregs_unlock();
- vfree(curfps);
+ /* Only free valloc'ed state */
+ if (curfps && curfps->is_valloc)
+ vfree(curfps);
+
return 0;
}
@@ -1595,7 +1601,7 @@ static int validate_sigaltstack(unsigned int usize)
return 0;
}
-static int __xstate_request_perm(u64 permitted, u64 requested)
+static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
{
/*
* This deliberately does not exclude !XSAVES as we still might
@@ -1605,9 +1611,10 @@ static int __xstate_request_perm(u64 permitted, u64 requested)
*/
bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES);
struct fpu *fpu = &current->group_leader->thread.fpu;
+ struct fpu_state_perm *perm;
unsigned int ksize, usize;
u64 mask;
- int ret;
+ int ret = 0;
/* Check whether fully enabled */
if ((permitted & requested) == requested)
@@ -1621,15 +1628,18 @@ static int __xstate_request_perm(u64 permitted, u64 requested)
mask &= XFEATURE_MASK_USER_SUPPORTED;
usize = xstate_calculate_size(mask, false);
- ret = validate_sigaltstack(usize);
- if (ret)
- return ret;
+ if (!guest) {
+ ret = validate_sigaltstack(usize);
+ if (ret)
+ return ret;
+ }
+ perm = guest ? &fpu->guest_perm : &fpu->perm;
/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
- WRITE_ONCE(fpu->perm.__state_perm, requested);
+ WRITE_ONCE(perm->__state_perm, requested);
/* Protected by sighand lock */
- fpu->perm.__state_size = ksize;
- fpu->perm.__user_state_size = usize;
+ perm->__state_size = ksize;
+ perm->__user_state_size = usize;
return ret;
}
@@ -1640,7 +1650,7 @@ static const u64 xstate_prctl_req[XFEATURE_MAX] = {
[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};
-static int xstate_request_perm(unsigned long idx)
+static int xstate_request_perm(unsigned long idx, bool guest)
{
u64 permitted, requested;
int ret;
@@ -1661,26 +1671,33 @@ static int xstate_request_perm(unsigned long idx)
return -EOPNOTSUPP;
/* Lockless quick check */
- permitted = xstate_get_host_group_perm();
+ permitted = xstate_get_group_perm(guest);
if ((permitted & requested) == requested)
return 0;
/* Protect against concurrent modifications */
spin_lock_irq(&current->sighand->siglock);
- permitted = xstate_get_host_group_perm();
- ret = __xstate_request_perm(permitted, requested);
+ permitted = xstate_get_group_perm(guest);
+
+ /* First vCPU allocation locks the permissions. */
+ if (guest && (permitted & FPU_GUEST_PERM_LOCKED))
+ ret = -EBUSY;
+ else
+ ret = __xstate_request_perm(permitted, requested, guest);
spin_unlock_irq(&current->sighand->siglock);
return ret;
}
-int xfd_enable_feature(u64 xfd_err)
+int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
{
u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
+ struct fpu_state_perm *perm;
unsigned int ksize, usize;
struct fpu *fpu;
if (!xfd_event) {
- pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
+ if (!guest_fpu)
+ pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
return 0;
}
@@ -1688,14 +1705,16 @@ int xfd_enable_feature(u64 xfd_err)
spin_lock_irq(&current->sighand->siglock);
/* If not permitted let it die */
- if ((xstate_get_host_group_perm() & xfd_event) != xfd_event) {
+ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) {
spin_unlock_irq(&current->sighand->siglock);
return -EPERM;
}
fpu = &current->group_leader->thread.fpu;
- ksize = fpu->perm.__state_size;
- usize = fpu->perm.__user_state_size;
+ perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
+ ksize = perm->__state_size;
+ usize = perm->__user_state_size;
+
/*
* The feature is permitted. State size is sufficient. Dropping
* the lock is safe here even if more features are added from
@@ -1708,17 +1727,29 @@ int xfd_enable_feature(u64 xfd_err)
* Try to allocate a new fpstate. If that fails there is no way
* out.
*/
- if (fpstate_realloc(xfd_event, ksize, usize))
+ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu))
return -EFAULT;
return 0;
}
+
+int xfd_enable_feature(u64 xfd_err)
+{
+ return __xfd_enable_feature(xfd_err, NULL);
+}
+
#else /* CONFIG_X86_64 */
-static inline int xstate_request_perm(unsigned long idx)
+static inline int xstate_request_perm(unsigned long idx, bool guest)
{
return -EPERM;
}
#endif /* !CONFIG_X86_64 */
+u64 xstate_get_guest_group_perm(void)
+{
+ return xstate_get_group_perm(true);
+}
+EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
+
/**
* fpu_xstate_prctl - xstate permission operations
* @tsk: Redundant pointer to current
@@ -1742,6 +1773,7 @@ long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2)
u64 __user *uptr = (u64 __user *)arg2;
u64 permitted, supported;
unsigned long idx = arg2;
+ bool guest = false;
if (tsk != current)
return -EPERM;
@@ -1760,11 +1792,20 @@ long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2)
permitted &= XFEATURE_MASK_USER_SUPPORTED;
return put_user(permitted, uptr);
+ case ARCH_GET_XCOMP_GUEST_PERM:
+ permitted = xstate_get_guest_group_perm();
+ permitted &= XFEATURE_MASK_USER_SUPPORTED;
+ return put_user(permitted, uptr);
+
+ case ARCH_REQ_XCOMP_GUEST_PERM:
+ guest = true;
+ fallthrough;
+
case ARCH_REQ_XCOMP_PERM:
if (!IS_ENABLED(CONFIG_X86_64))
return -EOPNOTSUPP;
- return xstate_request_perm(idx);
+ return xstate_request_perm(idx, guest);
default:
return -EINVAL;
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 86ea7c0fa2f6..d22ace092ca2 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -20,10 +20,19 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask)
xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT;
}
-static inline u64 xstate_get_host_group_perm(void)
+static inline u64 xstate_get_group_perm(bool guest)
{
+ struct fpu *fpu = &current->group_leader->thread.fpu;
+ struct fpu_state_perm *perm;
+
/* Pairs with WRITE_ONCE() in xstate_request_perm() */
- return READ_ONCE(current->group_leader->thread.fpu.perm.__state_perm);
+ perm = guest ? &fpu->guest_perm : &fpu->perm;
+ return READ_ONCE(perm->__state_perm);
+}
+
+static inline u64 xstate_get_host_group_perm(void)
+{
+ return xstate_get_group_perm(false);
}
enum xstate_copy_mode {
@@ -108,11 +117,7 @@ static inline u64 xfeatures_mask_independent(void)
"\n" \
"xor %[err], %[err]\n" \
"3:\n" \
- ".pushsection .fixup,\"ax\"\n" \
- "4: movl $-2, %[err]\n" \
- "jmp 3b\n" \
- ".popsection\n" \
- _ASM_EXTABLE(661b, 4b) \
+ _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
: [err] "=r" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
@@ -149,8 +154,14 @@ static inline void xfd_update_state(struct fpstate *fpstate)
}
}
}
+
+extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
#else
static inline void xfd_update_state(struct fpstate *fpstate) { }
+
+static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) {
+ return -EPERM;
+}
#endif
/*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c39f906cdc4e..7cc540e6de0c 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -303,7 +303,7 @@ union ftrace_op_code_union {
} __attribute__((packed));
};
-#define RET_SIZE 1
+#define RET_SIZE 1 + IS_ENABLED(CONFIG_SLS)
static unsigned long
create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index e405fe1a8bf4..a0ed0e4a2c0c 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -19,7 +19,7 @@
#endif
SYM_FUNC_START(__fentry__)
- ret
+ RET
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
@@ -84,7 +84,7 @@ ftrace_graph_call:
/* This is weak to keep gas from relaxing the jumps */
SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
- ret
+ RET
SYM_CODE_END(ftrace_caller)
SYM_CODE_START(ftrace_regs_caller)
@@ -177,7 +177,7 @@ SYM_CODE_START(ftrace_graph_caller)
popl %edx
popl %ecx
popl %eax
- ret
+ RET
SYM_CODE_END(ftrace_graph_caller)
.globl return_to_handler
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 7a879901f103..11ac028e30e4 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -132,7 +132,7 @@
#ifdef CONFIG_DYNAMIC_FTRACE
SYM_FUNC_START(__fentry__)
- retq
+ RET
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
@@ -176,11 +176,11 @@ SYM_FUNC_END(ftrace_caller);
SYM_FUNC_START(ftrace_epilogue)
/*
* This is weak to keep gas from relaxing the jumps.
- * It is also used to copy the retq for trampolines.
+ * It is also used to copy the RET for trampolines.
*/
SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
UNWIND_HINT_FUNC
- retq
+ RET
SYM_FUNC_END(ftrace_epilogue)
SYM_FUNC_START(ftrace_regs_caller)
@@ -284,7 +284,7 @@ SYM_FUNC_START(__fentry__)
jnz trace
SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL)
- retq
+ RET
trace:
/* save_mcount_regs fills in first two parameters */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index fc5371a7e9d1..de563db9cdcd 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -126,6 +126,36 @@ static bool __head check_la57_support(unsigned long physaddr)
}
#endif
+static unsigned long sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
+{
+ unsigned long vaddr, vaddr_end;
+ int i;
+
+ /* Encrypt the kernel and related (if SME is active) */
+ sme_encrypt_kernel(bp);
+
+ /*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (sme_get_me_mask()) {
+ vaddr = (unsigned long)__start_bss_decrypted;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
+ * Return the SME encryption mask (if SME is active) to be used as a
+ * modifier for the initial pgdir entry programmed into CR3.
+ */
+ return sme_get_me_mask();
+}
+
/* Code in __startup_64() can be relocated during execution, but the compiler
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
@@ -135,7 +165,6 @@ static bool __head check_la57_support(unsigned long physaddr)
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
- unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -276,34 +305,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
*/
*fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
- /* Encrypt the kernel and related (if SME is active) */
- sme_encrypt_kernel(bp);
-
- /*
- * Clear the memory encryption mask from the .bss..decrypted section.
- * The bss section will be memset to zero later in the initialization so
- * there is no need to zero it after changing the memory encryption
- * attribute.
- *
- * This is early code, use an open coded check for SME instead of
- * using cc_platform_has(). This eliminates worries about removing
- * instrumentation or checking boot_cpu_data in the cc_platform_has()
- * function.
- */
- if (sme_get_me_mask()) {
- vaddr = (unsigned long)__start_bss_decrypted;
- vaddr_end = (unsigned long)__end_bss_decrypted;
- for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
- i = pmd_index(vaddr);
- pmd[i] -= sme_get_me_mask();
- }
- }
-
- /*
- * Return the SME encryption mask (if SME is active) to be used as a
- * modifier for the initial pgdir entry programmed into CR3.
- */
- return sme_get_me_mask();
+ return sme_postprocess_startup(bp, pmd);
}
unsigned long __startup_secondary_64(void)
@@ -485,6 +487,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
+ /*
+ * This needs to happen *before* kasan_early_init() because latter maps stuff
+ * into that page.
+ */
clear_page(init_top_pgt);
/*
@@ -496,6 +502,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
kasan_early_init();
+ /*
+ * Flush global TLB entries which could be left over from the trampoline page
+ * table.
+ *
+ * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs
+ * instrument native_write_cr4() so KASAN must be initialized for that
+ * instrumentation to work.
+ */
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
+
idt_setup_early_handler();
copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index d8c64dab0efe..eb8656bac99b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -340,7 +340,7 @@ SYM_FUNC_END(startup_32_smp)
__INIT
setup_once:
andl $0,setup_once_ref /* Once is enough, thanks */
- ret
+ RET
SYM_FUNC_START(early_idt_handler_array)
# 36(%esp) %eflags
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d8b3ebd2bb85..9c63fc5988cd 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -166,9 +166,26 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
call sev_verify_cbit
popq %rsi
- /* Switch to new page-table */
+ /*
+ * Switch to new page-table
+ *
+ * For the boot CPU this switches to early_top_pgt which still has the
+ * indentity mappings present. The secondary CPUs will switch to the
+ * init_top_pgt here, away from the trampoline_pgd and unmap the
+ * indentity mapped ranges.
+ */
movq %rax, %cr3
+ /*
+ * Do a global TLB flush after the CR3 switch to make sure the TLB
+ * entries from the identity mapping are flushed.
+ */
+ movq %cr4, %rcx
+ movq %rcx, %rax
+ xorq $X86_CR4_PGE, %rcx
+ movq %rcx, %cr4
+ movq %rax, %cr4
+
/* Ensure I am executing from virtual addresses */
movq $1f, %rax
ANNOTATE_RETPOLINE_SAFE
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 882213df3713..71f336425e58 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1435,8 +1435,12 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
hpet_rtc_timer_reinit();
memset(&curr_time, 0, sizeof(struct rtc_time));
- if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
- mc146818_get_time(&curr_time);
+ if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) {
+ if (unlikely(mc146818_get_time(&curr_time) < 0)) {
+ pr_err_ratelimited("unable to read current time from RTC\n");
+ return IRQ_HANDLED;
+ }
+ }
if (hpet_rtc_flags & RTC_UIE &&
curr_time.tm_sec != hpet_prev_update_sec) {
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 760e1f293093..aaf9e776f323 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -11,7 +11,7 @@
SYM_FUNC_START(native_save_fl)
pushf
pop %_ASM_AX
- ret
+ RET
SYM_FUNC_END(native_save_fl)
.popsection
EXPORT_SYMBOL(native_save_fl)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index fce99e249d61..6290712cb36d 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1051,7 +1051,7 @@ asm(
" addl $4, %esp\n"
" popfl\n"
#endif
- " ret\n"
+ ASM_RET
".size __kretprobe_trampoline, .-__kretprobe_trampoline\n"
);
NOKPROBE_SYMBOL(__kretprobe_trampoline);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 59abbdad7729..a438217cbfac 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -313,7 +313,7 @@ static void kvm_register_steal_time(void)
return;
wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
- pr_info("stealtime: cpu %d, msr %llx\n", cpu,
+ pr_debug("stealtime: cpu %d, msr %llx\n", cpu,
(unsigned long long) slow_virt_to_phys(st));
}
@@ -350,7 +350,7 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
- pr_info("setup async PF for cpu %d\n", smp_processor_id());
+ pr_debug("setup async PF for cpu %d\n", smp_processor_id());
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
@@ -376,7 +376,7 @@ static void kvm_pv_disable_apf(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(apf_reason.enabled, 0);
- pr_info("disable async PF for cpu %d\n", smp_processor_id());
+ pr_debug("disable async PF for cpu %d\n", smp_processor_id());
}
static void kvm_disable_steal_time(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 462dd8e9b03d..a35cbf9107af 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -174,7 +174,7 @@ static void kvm_register_clock(char *txt)
pa = slow_virt_to_phys(&src->pvti) | 0x01ULL;
wrmsrl(msr_kvm_system_time, pa);
- pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
+ pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
}
static void kvm_save_sched_clock_state(void)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 169fb6f4cd2e..95fa745e310a 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void)
void *module_alloc(unsigned long size)
{
+ gfp_t gfp_mask = GFP_KERNEL;
void *p;
if (PAGE_ALIGN(size) > MODULES_LEN)
@@ -74,10 +75,10 @@ void *module_alloc(unsigned long size)
p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
- MODULES_END, GFP_KERNEL,
- PAGE_KERNEL, 0, NUMA_NO_NODE,
+ MODULES_END, gfp_mask,
+ PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
__builtin_return_address(0));
- if (p && (kasan_module_alloc(p, size) < 0)) {
+ if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 7f7636aac620..4420499f7bb4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -41,7 +41,7 @@ extern void _paravirt_nop(void);
asm (".pushsection .entry.text, \"ax\"\n"
".global _paravirt_nop\n"
"_paravirt_nop:\n\t"
- "ret\n\t"
+ ASM_RET
".size _paravirt_nop, . - _paravirt_nop\n\t"
".type _paravirt_nop, @function\n\t"
".popsection");
@@ -51,7 +51,7 @@ asm (".pushsection .entry.text, \"ax\"\n"
".global paravirt_ret0\n"
"paravirt_ret0:\n\t"
"xor %" _ASM_AX ", %" _ASM_AX ";\n\t"
- "ret\n\t"
+ ASM_RET
".size paravirt_ret0, . - paravirt_ret0\n\t"
".type paravirt_ret0, @function\n\t"
".popsection");
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 04143a653a8a..81d8ef036637 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -365,7 +365,7 @@ void arch_setup_new_exec(void)
clear_thread_flag(TIF_SSBD);
task_clear_spec_ssb_disable(current);
task_clear_spec_ssb_noexec(current);
- speculation_ctrl_update(task_thread_info(current)->flags);
+ speculation_ctrl_update(read_thread_flags());
}
}
@@ -617,7 +617,7 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
}
/* Return the updated threadinfo flags*/
- return task_thread_info(tsk)->flags;
+ return read_task_thread_flags(tsk);
}
void speculation_ctrl_update(unsigned long tif)
@@ -653,8 +653,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
unsigned long tifp, tifn;
- tifn = READ_ONCE(task_thread_info(next_p)->flags);
- tifp = READ_ONCE(task_thread_info(prev_p)->flags);
+ tifn = read_task_thread_flags(next_p);
+ tifp = read_task_thread_flags(prev_p);
switch_to_bitmap(tifp);
@@ -993,6 +993,8 @@ long do_arch_prctl_common(struct task_struct *task, int option,
case ARCH_GET_XCOMP_SUPP:
case ARCH_GET_XCOMP_PERM:
case ARCH_REQ_XCOMP_PERM:
+ case ARCH_GET_XCOMP_GUEST_PERM:
+ case ARCH_REQ_XCOMP_GUEST_PERM:
return fpu_xstate_prctl(task, option, arg2);
}
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
index 1d0797b2338a..76b547b83232 100644
--- a/arch/x86/kernel/process.h
+++ b/arch/x86/kernel/process.h
@@ -13,8 +13,8 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
static inline void switch_to_extra(struct task_struct *prev,
struct task_struct *next)
{
- unsigned long next_tif = task_thread_info(next)->flags;
- unsigned long prev_tif = task_thread_info(prev)->flags;
+ unsigned long next_tif = read_task_thread_flags(next);
+ unsigned long prev_tif = read_task_thread_flags(prev);
if (IS_ENABLED(CONFIG_SMP)) {
/*
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 0a40df66a40d..fa700b46588e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type)
spin_unlock(&rtc_lock);
/*
- * Switch back to the initial page table.
+ * Switch to the trampoline page table.
*/
-#ifdef CONFIG_X86_32
- load_cr3(initial_page_table);
-#else
- write_cr3(real_mode_header->trampoline_pgd);
-
- /* Exiting long mode will fail if CR4.PCIDE is set. */
- if (boot_cpu_has(X86_FEATURE_PCID))
- cr4_clear_bits(X86_CR4_PCIDE);
-#endif
+ load_trampoline_pgtable();
/* Jump to the identity-mapped low memory code */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index f469153eca8a..fcc8a7699103 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -91,7 +91,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
movl %edi, %eax
addl $(identity_mapped - relocate_kernel), %eax
pushl %eax
- ret
+ RET
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
@@ -159,7 +159,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
xorl %edx, %edx
xorl %esi, %esi
xorl %ebp, %ebp
- ret
+ RET
1:
popl %edx
movl CP_PA_SWAP_PAGE(%edi), %esp
@@ -190,7 +190,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movl %edi, %eax
addl $(virtual_mapped - relocate_kernel), %eax
pushl %eax
- ret
+ RET
SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
@@ -208,7 +208,7 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
popl %edi
popl %esi
popl %ebx
- ret
+ RET
SYM_CODE_END(virtual_mapped)
/* Do the copies */
@@ -271,7 +271,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
popl %edi
popl %ebx
popl %ebp
- ret
+ RET
SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index c8fe74a28143..399f075ccdc4 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -104,7 +104,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
/* jump to identity mapped page */
addq $(identity_mapped - relocate_kernel), %r8
pushq %r8
- ret
+ RET
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
@@ -191,7 +191,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
xorl %r14d, %r14d
xorl %r15d, %r15d
- ret
+ RET
1:
popq %rdx
@@ -210,7 +210,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
call swap_pages
movq $virtual_mapped, %rax
pushq %rax
- ret
+ RET
SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
@@ -231,7 +231,7 @@ SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
popq %r12
popq %rbp
popq %rbx
- ret
+ RET
SYM_CODE_END(virtual_mapped)
/* Do the copies */
@@ -288,7 +288,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
lea PAGE_SIZE(%rax), %rsi
jmp 0b
3:
- ret
+ RET
SYM_CODE_END(swap_pages)
.globl kexec_control_code_size
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 9b9fb7882c20..9ae64f9af956 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/dmi.h>
#include <linux/ioport.h>
#include <asm/e820/api.h>
@@ -23,11 +24,31 @@ static void resource_clip(struct resource *res, resource_size_t start,
res->start = end + 1;
}
+/*
+ * Some BIOS-es contain a bug where they add addresses which map to
+ * system RAM in the PCI host bridge window returned by the ACPI _CRS
+ * method, see commit 4dc2287c1805 ("x86: avoid E820 regions when
+ * allocating address space"). To avoid this Linux by default excludes
+ * E820 reservations when allocating addresses since 2010.
+ * In 2019 some systems have shown-up with E820 reservations which cover
+ * the entire _CRS returned PCI host bridge window, causing all attempts
+ * to assign memory to PCI BARs to fail if Linux uses E820 reservations.
+ *
+ * Ideally Linux would fully stop using E820 reservations, but then
+ * the old systems this was added for will regress.
+ * Instead keep the old behavior for old systems, while ignoring the
+ * E820 reservations for any systems from now on.
+ */
static void remove_e820_regions(struct resource *avail)
{
- int i;
+ int i, year = dmi_get_bios_year();
struct e820_entry *entry;
+ if (year >= 2018)
+ return;
+
+ pr_info_once("PCI: Removing E820 reservations from host bridge windows\n");
+
for (i = 0; i < e820_table->nr_entries; i++) {
entry = &e820_table->entries[i];
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c410be738ae7..f7a132eb794d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -40,6 +40,7 @@
#include <asm/kasan.h>
#include <asm/kaslr.h>
#include <asm/mce.h>
+#include <asm/memtype.h>
#include <asm/mtrr.h>
#include <asm/realmode.h>
#include <asm/olpc_ofw.h>
@@ -713,9 +714,6 @@ static void __init early_reserve_memory(void)
early_reserve_initrd();
- if (efi_enabled(EFI_BOOT))
- efi_memblock_x86_reserve_range();
-
memblock_x86_reserve_range_setup_data();
reserve_ibft_region();
@@ -742,28 +740,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}
-static char *prepare_command_line(void)
-{
-#ifdef CONFIG_CMDLINE_BOOL
-#ifdef CONFIG_CMDLINE_OVERRIDE
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-#else
- if (builtin_cmdline[0]) {
- /* append boot loader cmdline to builtin */
- strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
- strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
- }
-#endif
-#endif
-
- strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
-
- parse_early_param();
-
- return command_line;
-}
-
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -853,23 +829,6 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
/*
- * x86_configure_nx() is called before parse_early_param() (called by
- * prepare_command_line()) to detect whether hardware doesn't support
- * NX (so that the early EHCI debug console setup can safely call
- * set_fixmap()). It may then be called again from within noexec_setup()
- * during parsing early parameters to honor the respective command line
- * option.
- */
- x86_configure_nx();
-
- /*
- * This parses early params and it needs to run before
- * early_reserve_memory() because latter relies on such settings
- * supplied as early params.
- */
- *cmdline_p = prepare_command_line();
-
- /*
* Do some memory reservations *before* memory is added to memblock, so
* memblock allocations won't overwrite it.
*
@@ -902,6 +861,36 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
+ strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ /*
+ * x86_configure_nx() is called before parse_early_param() to detect
+ * whether hardware doesn't support NX (so that the early EHCI debug
+ * console setup can safely call set_fixmap()). It may then be called
+ * again from within noexec_setup() during parsing early parameters
+ * to honor the respective command line option.
+ */
+ x86_configure_nx();
+
+ parse_early_param();
+
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
+
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
@@ -979,7 +968,11 @@ void __init setup_arch(char **cmdline_p)
max_pfn = e820__end_of_ram_pfn();
/* update e820 for memory not covered by WB MTRRs */
- mtrr_bp_init();
+ if (IS_ENABLED(CONFIG_MTRR))
+ mtrr_bp_init();
+ else
+ pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
+
if (mtrr_trim_uncached_memory(max_pfn))
max_pfn = e820__end_of_ram_pfn();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 7b65275544b2..49325caa7307 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -84,60 +84,6 @@ static bool __init pcpu_need_numa(void)
}
#endif
-/**
- * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
- * @cpu: cpu to allocate for
- * @size: size allocation in bytes
- * @align: alignment
- *
- * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
- * does the right thing for NUMA regardless of the current
- * configuration.
- *
- * RETURNS:
- * Pointer to the allocated area on success, NULL on failure.
- */
-static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
- unsigned long align)
-{
- const unsigned long goal = __pa(MAX_DMA_ADDRESS);
-#ifdef CONFIG_NUMA
- int node = early_cpu_to_node(cpu);
- void *ptr;
-
- if (!node_online(node) || !NODE_DATA(node)) {
- ptr = memblock_alloc_from(size, align, goal);
- pr_info("cpu %d has no node %d or node-local memory\n",
- cpu, node);
- pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
- cpu, size, __pa(ptr));
- } else {
- ptr = memblock_alloc_try_nid(size, align, goal,
- MEMBLOCK_ALLOC_ACCESSIBLE,
- node);
-
- pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
- cpu, size, node, __pa(ptr));
- }
- return ptr;
-#else
- return memblock_alloc_from(size, align, goal);
-#endif
-}
-
-/*
- * Helpers for first chunk memory allocation
- */
-static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
-{
- return pcpu_alloc_bootmem(cpu, size, align);
-}
-
-static void __init pcpu_fc_free(void *ptr, size_t size)
-{
- memblock_free(ptr, size);
-}
-
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
#ifdef CONFIG_NUMA
@@ -150,7 +96,12 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
#endif
}
-static void __init pcpup_populate_pte(unsigned long addr)
+static int __init pcpu_cpu_to_node(int cpu)
+{
+ return early_cpu_to_node(cpu);
+}
+
+void __init pcpu_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
}
@@ -205,15 +156,14 @@ void __init setup_per_cpu_areas(void)
rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
dyn_size, atom_size,
pcpu_cpu_distance,
- pcpu_fc_alloc, pcpu_fc_free);
+ pcpu_cpu_to_node);
if (rc < 0)
pr_warn("%s allocator failed (%d), falling back to page size\n",
pcpu_fc_names[pcpu_chosen_fc], rc);
}
if (rc < 0)
rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
- pcpu_fc_alloc, pcpu_fc_free,
- pcpup_populate_pte);
+ pcpu_cpu_to_node);
if (rc < 0)
panic("cannot initialize percpu area (err=%d)", rc);
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index 787dc5f568b5..ce987688bbc0 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -221,7 +221,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
fail:
/* Terminate the guest */
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
}
static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 74f0ec955384..e6d316a01fdd 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -26,6 +26,7 @@
#include <asm/fpu/xcr.h>
#include <asm/processor.h>
#include <asm/realmode.h>
+#include <asm/setup.h>
#include <asm/traps.h>
#include <asm/svm.h>
#include <asm/smp.h>
@@ -86,9 +87,6 @@ struct ghcb_state {
static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
-/* Needed in vc_early_forward_exception */
-void do_early_exception(struct pt_regs *regs, int trapnr);
-
static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
unsigned long sp = regs->sp;
@@ -209,9 +207,6 @@ static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
return ghcb;
}
-/* Needed in vc_early_forward_exception */
-void do_early_exception(struct pt_regs *regs, int trapnr);
-
static inline u64 sev_es_rd_ghcb_msr(void)
{
return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
@@ -294,11 +289,6 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
char *dst, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
- char __user *target = (char __user *)dst;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
/*
* This function uses __put_user() independent of whether kernel or user
@@ -320,26 +310,42 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
- case 1:
+ case 1: {
+ u8 d1;
+ u8 __user *target = (u8 __user *)dst;
+
memcpy(&d1, buf, 1);
if (__put_user(d1, target))
goto fault;
break;
- case 2:
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *target = (u16 __user *)dst;
+
memcpy(&d2, buf, 2);
if (__put_user(d2, target))
goto fault;
break;
- case 4:
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *target = (u32 __user *)dst;
+
memcpy(&d4, buf, 4);
if (__put_user(d4, target))
goto fault;
break;
- case 8:
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *target = (u64 __user *)dst;
+
memcpy(&d8, buf, 8);
if (__put_user(d8, target))
goto fault;
break;
+ }
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
@@ -362,11 +368,6 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
char *src, char *buf, size_t size)
{
unsigned long error_code = X86_PF_PROT;
- char __user *s = (char __user *)src;
- u64 d8;
- u32 d4;
- u16 d2;
- u8 d1;
/*
* This function uses __get_user() independent of whether kernel or user
@@ -388,26 +389,41 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
* instructions here would cause infinite nesting.
*/
switch (size) {
- case 1:
+ case 1: {
+ u8 d1;
+ u8 __user *s = (u8 __user *)src;
+
if (__get_user(d1, s))
goto fault;
memcpy(buf, &d1, 1);
break;
- case 2:
+ }
+ case 2: {
+ u16 d2;
+ u16 __user *s = (u16 __user *)src;
+
if (__get_user(d2, s))
goto fault;
memcpy(buf, &d2, 2);
break;
- case 4:
+ }
+ case 4: {
+ u32 d4;
+ u32 __user *s = (u32 __user *)src;
+
if (__get_user(d4, s))
goto fault;
memcpy(buf, &d4, 4);
break;
- case 8:
+ }
+ case 8: {
+ u64 d8;
+ u64 __user *s = (u64 __user *)src;
if (__get_user(d8, s))
goto fault;
memcpy(buf, &d8, 8);
break;
+ }
default:
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
return ES_UNSUPPORTED;
@@ -776,22 +792,6 @@ static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
do_early_exception(ctxt->regs, trapnr);
}
-static long *vc_insn_get_reg(struct es_em_ctxt *ctxt)
-{
- long *reg_array;
- int offset;
-
- reg_array = (long *)ctxt->regs;
- offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs);
-
- if (offset < 0)
- return NULL;
-
- offset /= sizeof(long);
-
- return reg_array + offset;
-}
-
static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
{
long *reg_array;
@@ -839,76 +839,6 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2);
}
-static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct insn *insn = &ctxt->insn;
- unsigned int bytes = 0;
- enum es_result ret;
- int sign_byte;
- long *reg_data;
-
- switch (insn->opcode.bytes[1]) {
- /* MMIO Read w/ zero-extension */
- case 0xb6:
- bytes = 1;
- fallthrough;
- case 0xb7:
- if (!bytes)
- bytes = 2;
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Zero extend based on operand size */
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
- memset(reg_data, 0, insn->opnd_bytes);
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
-
- /* MMIO Read w/ sign-extension */
- case 0xbe:
- bytes = 1;
- fallthrough;
- case 0xbf:
- if (!bytes)
- bytes = 2;
-
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Sign extend based on operand size */
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
- if (bytes == 1) {
- u8 *val = (u8 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x80) ? 0xff : 0x00;
- } else {
- u16 *val = (u16 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x8000) ? 0xff : 0x00;
- }
- memset(reg_data, sign_byte, insn->opnd_bytes);
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
-
- default:
- ret = ES_UNSUPPORTED;
- }
-
- return ret;
-}
-
/*
* The MOVS instruction has two memory operands, which raises the
* problem that it is not known whether the access to the source or the
@@ -976,83 +906,79 @@ static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
return ES_RETRY;
}
-static enum es_result vc_handle_mmio(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
+static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
struct insn *insn = &ctxt->insn;
unsigned int bytes = 0;
+ enum mmio_type mmio;
enum es_result ret;
+ u8 sign_byte;
long *reg_data;
- switch (insn->opcode.bytes[0]) {
- /* MMIO Write */
- case 0x88:
- bytes = 1;
- fallthrough;
- case 0x89:
- if (!bytes)
- bytes = insn->opnd_bytes;
+ mmio = insn_decode_mmio(insn, &bytes);
+ if (mmio == MMIO_DECODE_FAILED)
+ return ES_DECODE_FAILED;
- reg_data = vc_insn_get_reg(ctxt);
+ if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
+ reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
if (!reg_data)
return ES_DECODE_FAILED;
+ }
+ switch (mmio) {
+ case MMIO_WRITE:
memcpy(ghcb->shared_buffer, reg_data, bytes);
-
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
-
- case 0xc6:
- bytes = 1;
- fallthrough;
- case 0xc7:
- if (!bytes)
- bytes = insn->opnd_bytes;
-
+ case MMIO_WRITE_IMM:
memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
-
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
-
- /* MMIO Read */
- case 0x8a:
- bytes = 1;
- fallthrough;
- case 0x8b:
- if (!bytes)
- bytes = insn->opnd_bytes;
-
+ case MMIO_READ:
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
- reg_data = vc_insn_get_reg(ctxt);
- if (!reg_data)
- return ES_DECODE_FAILED;
-
/* Zero-extend for 32-bit operation */
if (bytes == 4)
*reg_data = 0;
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
+ case MMIO_READ_ZERO_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
- /* MOVS instruction */
- case 0xa4:
- bytes = 1;
- fallthrough;
- case 0xa5:
- if (!bytes)
- bytes = insn->opnd_bytes;
+ /* Zero extend based on operand size */
+ memset(reg_data, 0, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+ case MMIO_READ_SIGN_EXTEND:
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
- ret = vc_handle_mmio_movs(ctxt, bytes);
+ if (bytes == 1) {
+ u8 *val = (u8 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x80) ? 0xff : 0x00;
+ } else {
+ u16 *val = (u16 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x8000) ? 0xff : 0x00;
+ }
+
+ /* Sign extend based on operand size */
+ memset(reg_data, sign_byte, insn->opnd_bytes);
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
- /* Two-Byte Opcodes */
- case 0x0f:
- ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt);
+ case MMIO_MOVS:
+ ret = vc_handle_mmio_movs(ctxt, bytes);
break;
default:
ret = ES_UNSUPPORTED;
+ break;
}
return ret;
@@ -1411,7 +1337,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
show_regs(regs);
/* Ask hypervisor to sev_es_terminate */
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
/* If that fails and we get here - just panic */
panic("Returned from Terminate-Request to Hypervisor\n");
@@ -1459,7 +1385,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
/* Do initial setup or terminate the guest */
if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
- sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+ sev_es_terminate(GHCB_SEV_ES_GEN_REQ);
vc_ghcb_invalidate(boot_ghcb);
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
index ee04941a6546..3355e27c69eb 100644
--- a/arch/x86/kernel/sev_verify_cbit.S
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -85,5 +85,5 @@ SYM_FUNC_START(sev_verify_cbit)
#endif
/* Return page-table pointer */
movq %rdi, %rax
- ret
+ RET
SYM_FUNC_END(sev_verify_cbit)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ac2909f0cab3..617012f4619f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -579,6 +579,17 @@ static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
{ NULL, },
};
+static struct sched_domain_topology_level x86_hybrid_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
@@ -1469,8 +1480,11 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
calculate_max_logical_packages();
+ /* XXX for now assume numa-in-package and hybrid don't overlap */
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ set_sched_topology(x86_hybrid_topology);
nmi_selftest();
impress_friends();
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 9c407a33a774..531fb4cbb63f 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -17,6 +17,8 @@ enum insn_type {
*/
static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
const void *emulate = NULL;
@@ -42,8 +44,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
break;
case RET:
- code = text_gen_insn(RET_INSN_OPCODE, insn, func);
- size = RET_INSN_SIZE;
+ code = &retinsn;
break;
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 2e076a459a0c..a698196377be 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1180,6 +1180,12 @@ void mark_tsc_unstable(char *reason)
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+static void __init tsc_disable_clocksource_watchdog(void)
+{
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1196,6 +1202,23 @@ static void __init check_system_tsc_reliable(void)
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+ /*
+ * Disable the clocksource watchdog when the system has:
+ * - TSC running at constant frequency
+ * - TSC which does not stop in C-States
+ * - the TSC_ADJUST register which allows to detect even minimal
+ * modifications
+ * - not more than two sockets. As the number of sockets cannot be
+ * evaluated at the early boot stage where this has to be
+ * invoked, check the number of online memory nodes as a
+ * fallback solution which is an reasonable estimate.
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ nr_online_nodes <= 2)
+ tsc_disable_clocksource_watchdog();
}
/*
@@ -1387,9 +1410,6 @@ static int __init init_tsc_clocksource(void)
if (tsc_unstable)
goto unreg;
- if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1527,7 +1547,7 @@ void __init tsc_init(void)
}
if (tsc_clocksource_reliable || no_tsc_watchdog)
- clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+ tsc_disable_clocksource_watchdog();
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 50a4515fe0ad..9452dc9664b5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,7 @@ struct tsc_adjust {
};
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
/*
* TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume)
}
}
+/*
+ * Normally the tsc_sync will be checked every time system enters idle
+ * state, but there is still caveat that a system won't enter idle,
+ * either because it's too busy or configured purposely to not enter
+ * idle.
+ *
+ * So setup a periodic timer (every 10 minutes) to make sure the check
+ * is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+ int next_cpu;
+
+ tsc_verify_tsc_adjust(false);
+
+ /* Run the check for all onlined CPUs in turn */
+ next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(cpu_online_mask);
+
+ tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+ add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+ return 0;
+
+ timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+ tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+ add_timer(&tsc_sync_check_timer);
+
+ return 0;
+}
+late_initcall(start_sync_check_timer);
+
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
unsigned int cpu, bool bootcpu)
{
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 641f0fe1e5b4..1258a5872d12 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -132,9 +132,9 @@ SYM_FUNC_START_LOCAL(verify_cpu)
.Lverify_cpu_no_longmode:
popf # Restore caller passed flags
movl $1,%eax
- ret
+ RET
.Lverify_cpu_sse_ok:
popf # Restore caller passed flags
xorl %eax, %eax
- ret
+ RET
SYM_FUNC_END(verify_cpu)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3d6dc12d198f..27f830345b6f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -137,7 +137,6 @@ SECTIONS
ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
STATIC_CALL_TEXT
- *(.fixup)
*(.gnu.warning)
#ifdef CONFIG_RETPOLINE
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 8b395821cb8d..7d20c1d34a3c 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -145,18 +145,6 @@ struct x86_platform_ops x86_platform __ro_after_init = {
EXPORT_SYMBOL_GPL(x86_platform);
-#if defined(CONFIG_PCI_MSI)
-struct x86_msi_ops x86_msi __ro_after_init = {
- .restore_msi_irqs = default_restore_msi_irqs,
-};
-
-/* MSI arch specific hooks */
-void arch_restore_msi_irqs(struct pci_dev *dev)
-{
- x86_msi.restore_msi_irqs(dev);
-}
-#endif
-
struct x86_apic_ops x86_apic_ops __ro_after_init = {
.io_apic_read = native_io_apic_read,
.restore = native_restore_boot_irq_mode,
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 619186138176..2b1548da00eb 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -26,7 +26,9 @@ config KVM
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_PFNCACHE
select HAVE_KVM_IRQFD
+ select HAVE_KVM_DIRTY_RING
select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
@@ -36,6 +38,7 @@ config KVM
select KVM_MMIO
select SCHED_INFO
select PERF_EVENTS
+ select GUEST_PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_CPU_RELAX_INTERCEPT
select HAVE_KVM_NO_POLL
@@ -43,6 +46,7 @@ config KVM
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_VFIO
select SRCU
+ select INTERVAL_TREE
select HAVE_KVM_PM_NOTIFIER if PM
help
Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 75dfd27b6e8a..30f244b64523 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,12 +7,7 @@ ifeq ($(CONFIG_FRAME_POINTER),y)
OBJECT_FILES_NON_STANDARD_vmenter.o := y
endif
-KVM := ../../../virt/kvm
-
-kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
- $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \
- $(KVM)/dirty_ring.o $(KVM)/binary_stats.o
-kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
+include $(srctree)/virt/kvm/Makefile.kvm
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 07e9215e911d..28be02adc669 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -32,7 +32,7 @@
u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
EXPORT_SYMBOL_GPL(kvm_cpu_caps);
-static u32 xstate_required_size(u64 xstate_bv, bool compacted)
+u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
int feature_bit = 0;
u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
@@ -42,7 +42,11 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
if (xstate_bv & 0x1) {
u32 eax, ebx, ecx, edx, offset;
cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
- offset = compacted ? ret : ebx;
+ /* ECX[1]: 64B alignment in compacted form */
+ if (compacted)
+ offset = (ecx & 0x2) ? ALIGN(ret, 64) : ret;
+ else
+ offset = ebx;
ret = max(ret, offset + eax);
}
@@ -80,9 +84,12 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
return NULL;
}
-static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
+static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entries,
+ int nent)
{
struct kvm_cpuid_entry2 *best;
+ u64 xfeatures;
/*
* The existing code assumes virtual address is 48-bit or 57-bit in the
@@ -96,6 +103,42 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
return -EINVAL;
}
+ /*
+ * Exposing dynamic xfeatures to the guest requires additional
+ * enabling in the FPU, e.g. to expand the guest XSAVE state size.
+ */
+ best = cpuid_entry2_find(entries, nent, 0xd, 0);
+ if (!best)
+ return 0;
+
+ xfeatures = best->eax | ((u64)best->edx << 32);
+ xfeatures &= XFEATURE_MASK_USER_DYNAMIC;
+ if (!xfeatures)
+ return 0;
+
+ return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
+}
+
+/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
+static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *orig;
+ int i;
+
+ if (nent != vcpu->arch.cpuid_nent)
+ return -EINVAL;
+
+ for (i = 0; i < nent; i++) {
+ orig = &vcpu->arch.cpuid_entries[i];
+ if (e2[i].function != orig->function ||
+ e2[i].index != orig->index ||
+ e2[i].flags != orig->flags ||
+ e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
+ e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
+ return -EINVAL;
+ }
+
return 0;
}
@@ -125,14 +168,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
}
}
-static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entries, int nent)
{
u32 base = vcpu->arch.kvm_cpuid_base;
if (!base)
return NULL;
- return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0);
+ return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
+}
+
+static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+{
+ return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent);
}
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
@@ -147,11 +197,28 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.pv_cpuid.features = best->eax;
}
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+/*
+ * Calculate guest's supported XCR0 taking into account guest CPUID data and
+ * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0).
+ */
+static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ best = cpuid_entry2_find(entries, nent, 0xd, 0);
+ if (!best)
+ return 0;
+
+ return (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+}
+
+static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *best;
+ u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
+
+ best = cpuid_entry2_find(entries, nent, 1, 0);
if (best) {
/* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE))
@@ -162,32 +229,52 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
}
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ best = cpuid_entry2_find(entries, nent, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE,
kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
- best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+ best = cpuid_entry2_find(entries, nent, 0xD, 0);
if (best)
best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
- best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
+ best = cpuid_entry2_find(entries, nent, 0xD, 1);
if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- best = kvm_find_kvm_cpuid_features(vcpu);
+ best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
if (kvm_hlt_in_guest(vcpu->kvm) && best &&
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
- best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ best = cpuid_entry2_find(entries, nent, 0x1, 0);
if (best)
cpuid_entry_change(best, X86_FEATURE_MWAIT,
vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT);
}
+
+ /*
+ * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+ * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+ * requested XCR0 value. The enclave's XFRM must be a subset of XCRO
+ * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+ * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
+ * '1' even on CPUs that don't support XSAVE.
+ */
+ best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
+ if (best) {
+ best->ecx &= guest_supported_xcr0 & 0xffffffff;
+ best->edx &= guest_supported_xcr0 >> 32;
+ best->ecx |= XFEATURE_MASK_FPSSE;
+ }
+}
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+ __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
@@ -206,27 +293,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_apic_set_version(vcpu);
}
- best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
- if (!best)
- vcpu->arch.guest_supported_xcr0 = 0;
- else
- vcpu->arch.guest_supported_xcr0 =
- (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
-
- /*
- * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
- * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
- * requested XCR0 value. The enclave's XFRM must be a subset of XCRO
- * at the time of EENTER, thus adjust the allowed XFRM by the guest's
- * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
- * '1' even on CPUs that don't support XSAVE.
- */
- best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
- if (best) {
- best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
- best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
- best->ecx |= XFEATURE_MASK_FPSSE;
- }
+ vcpu->arch.guest_supported_xcr0 =
+ cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
kvm_update_pv_runtime(vcpu);
@@ -276,21 +344,42 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
int nent)
{
- int r;
+ int r;
+
+ __kvm_update_cpuid_runtime(vcpu, e2, nent);
+
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+ * the core vCPU model on the fly. It would've been better to forbid any
+ * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
+ * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
+ * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
+ * whether the supplied CPUID data is equal to what's already set.
+ */
+ if (vcpu->arch.last_vmentry_cpu != -1) {
+ r = kvm_cpuid_check_equal(vcpu, e2, nent);
+ if (r)
+ return r;
- r = kvm_check_cpuid(e2, nent);
- if (r)
- return r;
+ kvfree(e2);
+ return 0;
+ }
+
+ r = kvm_check_cpuid(vcpu, e2, nent);
+ if (r)
+ return r;
- kvfree(vcpu->arch.cpuid_entries);
- vcpu->arch.cpuid_entries = e2;
- vcpu->arch.cpuid_nent = nent;
+ kvfree(vcpu->arch.cpuid_entries);
+ vcpu->arch.cpuid_entries = e2;
+ vcpu->arch.cpuid_nent = nent;
- kvm_update_kvm_cpuid_base(vcpu);
- kvm_update_cpuid_runtime(vcpu);
- kvm_vcpu_after_set_cpuid(vcpu);
+ kvm_update_kvm_cpuid_base(vcpu);
+ kvm_vcpu_after_set_cpuid(vcpu);
- return 0;
+ return 0;
}
/* when an old userspace process fills a new kernel module */
@@ -422,9 +511,11 @@ void kvm_set_cpu_caps(void)
#ifdef CONFIG_X86_64
unsigned int f_gbpages = F(GBPAGES);
unsigned int f_lm = F(LM);
+ unsigned int f_xfd = F(XFD);
#else
unsigned int f_gbpages = 0;
unsigned int f_lm = 0;
+ unsigned int f_xfd = 0;
#endif
memset(kvm_cpu_caps, 0, sizeof(kvm_cpu_caps));
@@ -492,7 +583,8 @@ void kvm_set_cpu_caps(void)
F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
- F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16)
+ F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
+ F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
@@ -511,7 +603,7 @@ void kvm_set_cpu_caps(void)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
- F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
+ F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
);
kvm_cpu_cap_init_scattered(CPUID_12_EAX,
@@ -523,7 +615,7 @@ void kvm_set_cpu_caps(void)
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
- F(TOPOEXT) | F(PERFCTR_CORE)
+ F(TOPOEXT) | 0 /* PERFCTR_CORE */
);
kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
@@ -637,6 +729,8 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
case 0x14:
case 0x17:
case 0x18:
+ case 0x1d:
+ case 0x1e:
case 0x1f:
case 0x8000001d:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
@@ -770,10 +864,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
perf_get_x86_pmu_capability(&cap);
/*
- * Only support guest architectural pmu on a host
- * with architectural pmu.
+ * The guest architecture pmu is only supported if the architecture
+ * pmu exists on the host and the module parameters allow it.
*/
- if (!cap.version)
+ if (!cap.version || !enable_pmu)
memset(&cap, 0, sizeof(cap));
eax.split.version_id = min(cap.version, 2);
@@ -811,12 +905,15 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
goto out;
}
break;
- case 0xd:
- entry->eax &= supported_xcr0;
- entry->ebx = xstate_required_size(supported_xcr0, false);
+ case 0xd: {
+ u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm();
+ u64 permitted_xss = supported_xss;
+
+ entry->eax &= permitted_xcr0;
+ entry->ebx = xstate_required_size(permitted_xcr0, false);
entry->ecx = entry->ebx;
- entry->edx &= supported_xcr0 >> 32;
- if (!supported_xcr0)
+ entry->edx &= permitted_xcr0 >> 32;
+ if (!permitted_xcr0)
break;
entry = do_host_cpuid(array, function, 1);
@@ -825,20 +922,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
cpuid_entry_override(entry, CPUID_D_1_EAX);
if (entry->eax & (F(XSAVES)|F(XSAVEC)))
- entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
+ entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
true);
else {
- WARN_ON_ONCE(supported_xss != 0);
+ WARN_ON_ONCE(permitted_xss != 0);
entry->ebx = 0;
}
- entry->ecx &= supported_xss;
- entry->edx &= supported_xss >> 32;
+ entry->ecx &= permitted_xss;
+ entry->edx &= permitted_xss >> 32;
for (i = 2; i < 64; ++i) {
bool s_state;
- if (supported_xcr0 & BIT_ULL(i))
+ if (permitted_xcr0 & BIT_ULL(i))
s_state = false;
- else if (supported_xss & BIT_ULL(i))
+ else if (permitted_xss & BIT_ULL(i))
s_state = true;
else
continue;
@@ -852,16 +949,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
* invalid sub-leafs. Only valid sub-leafs should
* reach this point, and they should have a non-zero
* save state size. Furthermore, check whether the
- * processor agrees with supported_xcr0/supported_xss
+ * processor agrees with permitted_xcr0/permitted_xss
* on whether this is an XCR0- or IA32_XSS-managed area.
*/
if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
--array->nent;
continue;
}
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ entry->ecx &= ~BIT_ULL(2);
entry->edx = 0;
}
break;
+ }
case 0x12:
/* Intel SGX */
if (!kvm_cpu_cap_has(X86_FEATURE_SGX)) {
@@ -906,6 +1007,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
goto out;
}
break;
+ /* Intel AMX TILE */
+ case 0x1d:
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) {
+ if (!do_host_cpuid(array, function, i))
+ goto out;
+ }
+ break;
+ case 0x1e: /* TMUL information */
+ if (!kvm_cpu_cap_has(X86_FEATURE_AMX_TILE)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+ break;
case KVM_CPUID_SIGNATURE: {
const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
entry->eax = KVM_CPUID_FEATURES;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c99edfff7f82..8a770b481d9d 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -30,6 +30,8 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool exact_only);
+u32 xstate_required_size(u64 xstate_bv, bool compacted);
+
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 54a83a744538..9240b3b7f8dd 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -95,6 +95,9 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
int i, j, k, l, ret;
+ if (!kvm_memslots_have_rmaps(kvm))
+ return 0;
+
ret = -ENOMEM;
memset(log, 0, sizeof(log));
for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
@@ -107,9 +110,10 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ int bkt;
+
slots = __kvm_memslots(kvm, i);
- for (j = 0; j < slots->used_slots; j++) {
- slot = &slots->memslots[j];
+ kvm_for_each_memslot(slot, bkt, slots)
for (k = 0; k < KVM_NR_PAGE_SIZES; k++) {
rmap = slot->arch.rmap[k];
lpage_size = kvm_mmu_slot_lpages(slot, k + 1);
@@ -121,7 +125,6 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
cur[index]++;
}
}
- }
}
write_unlock(&kvm->mmu_lock);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 28b1a4e57827..5719d8cfdbd9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -175,6 +175,7 @@
#define No16 ((u64)1 << 53) /* No 16 bit operand */
#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */
+#define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -191,8 +192,9 @@
#define FASTOP_SIZE 8
struct opcode {
- u64 flags : 56;
- u64 intercept : 8;
+ u64 flags;
+ u8 intercept;
+ u8 pad[7];
union {
int (*execute)(struct x86_emulate_ctxt *ctxt);
const struct opcode *group;
@@ -315,7 +317,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
__FOP_FUNC(#name)
#define __FOP_RET(name) \
- "ret \n\t" \
+ "11: " ASM_RET \
".size " name ", .-" name "\n\t"
#define FOP_RET(name) \
@@ -344,7 +346,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
__FOP_RET(#op "_" #dst)
#define FOP1EEX(op, dst) \
- FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
+ FOP1E(op, dst) _ASM_EXTABLE_TYPE_REG(10b, 11b, EX_TYPE_ZERO_REG, %%esi)
#define FASTOP1(op) \
FOP_START(op) \
@@ -434,10 +436,6 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
#op " %al \n\t" \
__FOP_RET(#op)
-asm(".pushsection .fixup, \"ax\"\n"
- "kvm_fastop_exception: xor %esi, %esi; ret\n"
- ".popsection");
-
FOP_START(setcc)
FOP_SETCC(seto)
FOP_SETCC(setno)
@@ -473,12 +471,8 @@ FOP_END;
\
asm volatile("1:" insn "\n" \
"2:\n" \
- ".pushsection .fixup, \"ax\"\n" \
- "3: movl $1, %[_fault]\n" \
- " jmp 2b\n" \
- ".popsection\n" \
- _ASM_EXTABLE(1b, 3b) \
- : [_fault] "+qm"(_fault) inoutclob ); \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[_fault]) \
+ : [_fault] "+r"(_fault) inoutclob ); \
\
_fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
})
@@ -4364,10 +4358,10 @@ static const struct opcode group4[] = {
static const struct opcode group5[] = {
F(DstMem | SrcNone | Lock, em_inc),
F(DstMem | SrcNone | Lock, em_dec),
- I(SrcMem | NearBranch, em_call_near_abs),
- I(SrcMemFAddr | ImplicitOps, em_call_far),
- I(SrcMem | NearBranch, em_jmp_abs),
- I(SrcMemFAddr | ImplicitOps, em_jmp_far),
+ I(SrcMem | NearBranch | IsBranch, em_call_near_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far),
+ I(SrcMem | NearBranch | IsBranch, em_jmp_abs),
+ I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far),
I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
};
@@ -4577,7 +4571,7 @@ static const struct opcode opcode_table[256] = {
I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
/* 0x70 - 0x7F */
- X16(D(SrcImmByte | NearBranch)),
+ X16(D(SrcImmByte | NearBranch | IsBranch)),
/* 0x80 - 0x87 */
G(ByteOp | DstMem | SrcImm, group1),
G(DstMem | SrcImm, group1),
@@ -4596,7 +4590,7 @@ static const struct opcode opcode_table[256] = {
DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
/* 0x98 - 0x9F */
D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
- I(SrcImmFAddr | No64, em_call_far), N,
+ I(SrcImmFAddr | No64 | IsBranch, em_call_far), N,
II(ImplicitOps | Stack, em_pushf, pushf),
II(ImplicitOps | Stack, em_popf, popf),
I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
@@ -4616,17 +4610,19 @@ static const struct opcode opcode_table[256] = {
X8(I(DstReg | SrcImm64 | Mov, em_mov)),
/* 0xC0 - 0xC7 */
G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
- I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm),
- I(ImplicitOps | NearBranch, em_ret),
+ I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm),
+ I(ImplicitOps | NearBranch | IsBranch, em_ret),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
- I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
- I(ImplicitOps | SrcImmU16, em_ret_far_imm),
- I(ImplicitOps, em_ret_far),
- D(ImplicitOps), DI(SrcImmByte, intn),
- D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
+ I(Stack | SrcImmU16 | Src2ImmByte | IsBranch, em_enter),
+ I(Stack | IsBranch, em_leave),
+ I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm),
+ I(ImplicitOps | IsBranch, em_ret_far),
+ D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn),
+ D(ImplicitOps | No64 | IsBranch),
+ II(ImplicitOps | IsBranch, em_iret, iret),
/* 0xD0 - 0xD7 */
G(Src2One | ByteOp, group2), G(Src2One, group2),
G(Src2CL | ByteOp, group2), G(Src2CL, group2),
@@ -4637,14 +4633,15 @@ static const struct opcode opcode_table[256] = {
/* 0xD8 - 0xDF */
N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
/* 0xE0 - 0xE7 */
- X3(I(SrcImmByte | NearBranch, em_loop)),
- I(SrcImmByte | NearBranch, em_jcxz),
+ X3(I(SrcImmByte | NearBranch | IsBranch, em_loop)),
+ I(SrcImmByte | NearBranch | IsBranch, em_jcxz),
I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
/* 0xE8 - 0xEF */
- I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch),
- I(SrcImmFAddr | No64, em_jmp_far),
- D(SrcImmByte | ImplicitOps | NearBranch),
+ I(SrcImm | NearBranch | IsBranch, em_call),
+ D(SrcImm | ImplicitOps | NearBranch | IsBranch),
+ I(SrcImmFAddr | No64 | IsBranch, em_jmp_far),
+ D(SrcImmByte | ImplicitOps | NearBranch | IsBranch),
I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
/* 0xF0 - 0xF7 */
@@ -4660,7 +4657,7 @@ static const struct opcode opcode_table[256] = {
static const struct opcode twobyte_table[256] = {
/* 0x00 - 0x0F */
G(0, group6), GD(0, &group7), N, N,
- N, I(ImplicitOps | EmulateOnUD, em_syscall),
+ N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall),
II(ImplicitOps | Priv, em_clts, clts), N,
DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
@@ -4691,8 +4688,8 @@ static const struct opcode twobyte_table[256] = {
IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
II(ImplicitOps | Priv, em_rdmsr, rdmsr),
IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
- I(ImplicitOps | EmulateOnUD, em_sysenter),
- I(ImplicitOps | Priv | EmulateOnUD, em_sysexit),
+ I(ImplicitOps | EmulateOnUD | IsBranch, em_sysenter),
+ I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit),
N, N,
N, N, N, N, N, N, N, N,
/* 0x40 - 0x4F */
@@ -4710,7 +4707,7 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N,
N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x80 - 0x8F */
- X16(D(SrcImm | NearBranch)),
+ X16(D(SrcImm | NearBranch | IsBranch)),
/* 0x90 - 0x9F */
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
@@ -5224,6 +5221,8 @@ done_prefixes:
ctxt->d |= opcode.flags;
}
+ ctxt->is_branch = opcode.flags & IsBranch;
+
/* Unrecognised? */
if (ctxt->d == 0)
return EMULATION_FAILED;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 5e19e6e4c2ce..6e38a7d22e97 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -164,7 +164,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
{
struct kvm_vcpu *vcpu = NULL;
- int i;
+ unsigned long i;
if (vpidx >= KVM_MAX_VCPUS)
return NULL;
@@ -1716,7 +1716,8 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
{
struct kvm_hv *hv = to_kvm_hv(kvm);
struct kvm_vcpu *vcpu;
- int i, bank, sbank = 0;
+ int bank, sbank = 0;
+ unsigned long i;
memset(vp_bitmap, 0,
KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap));
@@ -1863,7 +1864,7 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
.vector = vector
};
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
@@ -1922,11 +1923,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
+ if (all_cpus)
+ goto check_and_send_ipi;
+
if (!sparse_banks_len)
goto ret_success;
- if (!all_cpus &&
- kvm_read_guest(kvm,
+ if (kvm_read_guest(kvm,
hc->ingpa + offsetof(struct hv_send_ipi_ex,
vp_set.bank_contents),
sparse_banks,
@@ -1934,6 +1937,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
+check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
@@ -2516,6 +2520,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_NESTED_FEATURES:
ent->eax = evmcs_ver;
+ if (evmcs_ver)
+ ent->eax |= HV_X64_NESTED_MSR_BITMAP;
break;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 5a69cce4d72d..0b65a764ed3a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -242,7 +242,7 @@ static void pit_do_work(struct kthread_work *work)
struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
struct kvm *kvm = pit->kvm;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
struct kvm_kpit_state *ps = &pit->pit_state;
if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 0b80263d46d8..814064d06016 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -50,7 +50,7 @@ static void pic_unlock(struct kvm_pic *s)
{
bool wakeup = s->wakeup_needed;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
s->wakeup_needed = false;
@@ -270,7 +270,8 @@ int kvm_pic_read_irq(struct kvm *kvm)
static void kvm_pic_reset(struct kvm_kpic_state *s)
{
- int irq, i;
+ int irq;
+ unsigned long i;
struct kvm_vcpu *vcpu;
u8 edge_irr = s->irr & ~s->elcr;
bool found = false;
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 816a82515dcd..decfa36b7891 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -149,7 +149,7 @@ void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
if (RTC_GSI >= IOAPIC_NUM_PINS)
return;
@@ -184,7 +184,7 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq)
{
- int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index e66e620c3bed..539333ac4b38 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -81,7 +81,6 @@ struct kvm_ioapic {
unsigned long irq_states[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
- void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
struct rtc_status rtc_status;
struct delayed_work eoi_inject;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 650642b18d15..c2d7cfe82d00 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -56,7 +56,6 @@ struct kvm_pic {
struct kvm_io_device dev_master;
struct kvm_io_device dev_slave;
struct kvm_io_device dev_elcr;
- void (*ack_notifier)(void *opaque, int irq);
unsigned long irq_states[PIC_NUM_PINS];
};
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index d5b72a08e566..6e0dab04320e 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -24,6 +24,7 @@
#include "hyperv.h"
#include "x86.h"
+#include "xen.h"
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
@@ -45,9 +46,9 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, struct dest_map *dest_map)
{
- int i, r = -1;
+ int r = -1;
struct kvm_vcpu *vcpu, *lowest = NULL;
- unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+ unsigned long i, dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
unsigned int dest_vcpus = 0;
if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
@@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
return r;
break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ if (!level)
+ return -1;
+
+ return kvm_xen_set_evtchn_fast(e, kvm);
+#endif
default:
break;
}
@@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm,
e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
e->hv_sint.sint = ue->u.hv_sint.sint;
break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_IRQ_ROUTING_XEN_EVTCHN:
+ return kvm_xen_setup_evtchn(kvm, e, ue);
+#endif
default:
return -EINVAL;
}
@@ -320,7 +332,8 @@ int kvm_set_routing_entry(struct kvm *kvm,
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
- int i, r = 0;
+ int r = 0;
+ unsigned long i;
struct kvm_vcpu *vcpu;
if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 90e1ffdc05b7..3febc342360c 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -9,6 +9,12 @@
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
+#define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG)
+#define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP)
+#define X86_CR4_PDPTR_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP)
+
+static_assert(!(KVM_POSSIBLE_CR0_GUEST_BITS & X86_CR0_PDPTR_BITS));
+
#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
{ \
@@ -37,6 +43,13 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14)
BUILD_KVM_GPR_ACCESSORS(r15, R15)
#endif
+/*
+ * avail dirty
+ * 0 0 register in VMCS/VMCB
+ * 0 1 *INVALID*
+ * 1 0 register in vcpu->arch
+ * 1 1 register in vcpu->arch, needs to be stored back
+ */
static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
@@ -55,13 +68,6 @@ static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu,
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
}
-static inline void kvm_register_clear_available(struct kvm_vcpu *vcpu,
- enum kvm_reg reg)
-{
- __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
- __clear_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
-}
-
static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 68b420289d7e..39eded2426ff 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -369,6 +369,7 @@ struct x86_emulate_ctxt {
struct fetch_cache fetch;
struct read_cache io_read;
struct read_cache mem_read;
+ bool is_branch;
};
/* Repeat String Operation Prefix */
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index c7db2df50a7a..b469f45e3fe4 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -33,7 +33,8 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm,
{
struct kvm_arch *kvm_arch = &kvm->arch;
struct kvm_vcpu *vcpu;
- int ret = 0, i, nr_unique_valid_roots;
+ int ret = 0, nr_unique_valid_roots;
+ unsigned long i;
hpa_t root;
spin_lock(&kvm_arch->hv_root_tdp_lock);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 759952dd1222..4662469240bc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -185,7 +185,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
u32 max_id = 255; /* enough space for any xAPIC ID */
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
@@ -673,41 +673,40 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
}
-static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
-{
- u8 val;
- if (pv_eoi_get_user(vcpu, &val) < 0) {
- printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n",
- (unsigned long long)vcpu->arch.pv_eoi.msr_val);
- return false;
- }
- return val & KVM_PV_EOI_ENABLED;
-}
-
static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
{
- if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
- printk(KERN_WARNING "Can't set EOI MSR value: 0x%llx\n",
- (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0)
return;
- }
+
__set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
+static bool pv_eoi_test_and_clr_pending(struct kvm_vcpu *vcpu)
{
- if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
- printk(KERN_WARNING "Can't clear EOI MSR value: 0x%llx\n",
- (unsigned long long)vcpu->arch.pv_eoi.msr_val);
- return;
- }
+ u8 val;
+
+ if (pv_eoi_get_user(vcpu, &val) < 0)
+ return false;
+
+ val &= KVM_PV_EOI_ENABLED;
+
+ if (val && pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0)
+ return false;
+
+ /*
+ * Clear pending bit in any case: it will be set again on vmentry.
+ * While this might not be ideal from performance point of view,
+ * this makes sure pv eoi is only enabled when we know it's safe.
+ */
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+
+ return val;
}
static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
- if (apic->vcpu->arch.apicv_active)
+ if (kvm_x86_ops.sync_pir_to_irr)
highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
@@ -1101,6 +1100,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
kvm_lapic_set_irr(vector, apic);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
+ } else {
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
}
break;
@@ -1172,8 +1174,8 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_lapic *src = NULL;
struct kvm_apic_map *map;
struct kvm_vcpu *vcpu;
- unsigned long bitmap;
- int i, vcpu_idx;
+ unsigned long bitmap, i;
+ int vcpu_idx;
bool ret;
rcu_read_lock();
@@ -1931,7 +1933,7 @@ void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
/* If the preempt notifier has already run, it also called apic_timer_expired */
if (!apic->lapic_timer.hv_timer_in_use)
goto out;
- WARN_ON(rcuwait_active(&vcpu->wait));
+ WARN_ON(kvm_vcpu_is_blocking(vcpu));
apic_timer_expired(apic, false);
cancel_hv_timer(apic);
@@ -1948,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
{
restart_apic_timer(vcpu->arch.apic);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
{
@@ -1960,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
start_sw_timer(apic);
preempt_enable();
}
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
{
@@ -2629,7 +2629,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
- hrtimer_cancel(&apic->lapic_timer.timer);
+ cancel_apic_timer(apic);
apic->lapic_timer.expired_tscdeadline = 0;
apic_update_lvtt(apic);
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
@@ -2677,7 +2677,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
struct kvm_lapic *apic)
{
- bool pending;
int vector;
/*
* PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
@@ -2691,14 +2690,8 @@ static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
* -> host enabled PV EOI, guest executed EOI.
*/
BUG_ON(!pv_eoi_enabled(vcpu));
- pending = pv_eoi_get_pending(vcpu);
- /*
- * Clear pending bit in any case: it will be set again on vmentry.
- * While this might not be ideal from performance point of view,
- * this makes sure pv eoi is only enabled when we know it's safe.
- */
- pv_eoi_clr_pending(vcpu);
- if (pending)
+
+ if (pv_eoi_test_and_clr_pending(vcpu))
return;
vector = apic_set_eoi(apic);
trace_kvm_pv_eoi(apic, vector);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 9ae6168d381e..e9fbb2c8bbe2 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,7 +71,8 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
unsigned long cr4, u64 efer, gpa_t nested_cr3);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
- bool accessed_dirty, gpa_t new_eptp);
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
@@ -351,4 +352,17 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
{
atomic64_add(count, &kvm->stat.pages[level - 1]);
}
+
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception);
+
+static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu,
+ gpa_t gpa, u32 access,
+ struct x86_exception *exception)
+{
+ if (mmu != &vcpu->arch.nested_mmu)
+ return gpa;
+ return translate_nested_gpa(vcpu, gpa, access, exception);
+}
#endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3be9beea838d..593093b52395 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -335,12 +335,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
return likely(kvm_gen == spte_gen);
}
-static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception)
-{
- return gpa;
-}
-
static int is_cpuid_PSE36(void)
{
return 1;
@@ -1454,7 +1448,7 @@ static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
{
u64 *sptep;
struct rmap_iterator iter;
- int need_flush = 0;
+ bool need_flush = false;
u64 new_spte;
kvm_pfn_t new_pfn;
@@ -1466,7 +1460,7 @@ restart:
rmap_printk("spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
- need_flush = 1;
+ need_flush = true;
if (pte_write(pte)) {
pte_list_remove(kvm, rmap_head, sptep);
@@ -1482,7 +1476,7 @@ restart:
if (need_flush && kvm_available_flush_tlb_with_range()) {
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
- return 0;
+ return false;
}
return need_flush;
@@ -1582,7 +1576,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
- flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
+ flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
return flush;
}
@@ -1623,8 +1617,8 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
for_each_rmap_spte(rmap_head, &iter, sptep)
if (is_accessed_spte(*sptep))
- return 1;
- return 0;
+ return true;
+ return false;
}
#define RMAP_RECYCLE_THRESHOLD 1000
@@ -1936,7 +1930,11 @@ static void mmu_audit_disable(void) { }
static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- return sp->role.invalid ||
+ if (sp->role.invalid)
+ return true;
+
+ /* TDP MMU pages due not use the MMU generation. */
+ return !sp->tdp_mmu_page &&
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
@@ -2082,10 +2080,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
role = vcpu->arch.mmu->mmu_role.base;
role.level = level;
role.direct = direct;
- if (role.direct)
- role.gpte_is_8_bytes = true;
role.access = access;
- if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
+ if (role.has_4_byte_gpte) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
@@ -2173,10 +2169,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
iterator->shadow_addr = root;
iterator->level = vcpu->arch.mmu->shadow_root_level;
- if (iterator->level == PT64_ROOT_4LEVEL &&
+ if (iterator->level >= PT64_ROOT_4LEVEL &&
vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu->direct_map)
- --iterator->level;
+ iterator->level = PT32E_ROOT_LEVEL;
if (iterator->level == PT32E_ROOT_LEVEL) {
/*
@@ -2561,10 +2557,10 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
return r;
}
-static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
trace_kvm_mmu_unsync_page(sp);
- ++vcpu->kvm->stat.mmu_unsync;
+ ++kvm->stat.mmu_unsync;
sp->unsync = 1;
kvm_mmu_mark_parents_unsync(sp);
@@ -2576,7 +2572,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
* were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
* be write-protected.
*/
-int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
gfn_t gfn, bool can_unsync, bool prefetch)
{
struct kvm_mmu_page *sp;
@@ -2587,7 +2583,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
* track machinery is used to write-protect upper-level shadow pages,
* i.e. this guards the role.level == 4K assertion below!
*/
- if (kvm_slot_page_track_is_active(vcpu, slot, gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
return -EPERM;
/*
@@ -2596,7 +2592,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
* that case, KVM must complete emulation of the guest TLB flush before
* allowing shadow pages to become unsync (writable by the guest).
*/
- for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
if (!can_unsync)
return -EPERM;
@@ -2615,7 +2611,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
*/
if (!locked) {
locked = true;
- spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
+ spin_lock(&kvm->arch.mmu_unsync_pages_lock);
/*
* Recheck after taking the spinlock, a different vCPU
@@ -2630,10 +2626,10 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
WARN_ON(sp->role.level != PG_LEVEL_4K);
- kvm_unsync_page(vcpu, sp);
+ kvm_unsync_page(kvm, sp);
}
if (locked)
- spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock);
+ spin_unlock(&kvm->arch.mmu_unsync_pages_lock);
/*
* We need to ensure that the marking of unsync pages is visible
@@ -3405,7 +3401,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
- int r = 0, i;
+ int r = 0, i, bkt;
/*
* Check if this is the first shadow root being allocated before
@@ -3430,7 +3426,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(slot, slots) {
+ kvm_for_each_memslot(slot, bkt, slots) {
/*
* Both of these functions are no-ops if the target is
* already allocated, so unconditionally calling both
@@ -3730,21 +3726,13 @@ void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
}
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
- u32 access, struct x86_exception *exception)
-{
- if (exception)
- exception->error_code = 0;
- return vaddr;
-}
-
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
- u32 access,
- struct x86_exception *exception)
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t vaddr, u32 access,
+ struct x86_exception *exception)
{
if (exception)
exception->error_code = 0;
- return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
+ return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
}
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
@@ -3884,7 +3872,7 @@ static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
* guest is writing the page which is write tracked which can
* not be fixed by page fault handler.
*/
- if (kvm_slot_page_track_is_active(vcpu, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
return true;
return false;
@@ -3976,6 +3964,34 @@ out_retry:
return true;
}
+/*
+ * Returns true if the page fault is stale and needs to be retried, i.e. if the
+ * root was invalidated by a memslot update or a relevant mmu_notifier fired.
+ */
+static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault, int mmu_seq)
+{
+ struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa);
+
+ /* Special roots, e.g. pae_root, are not backed by shadow pages. */
+ if (sp && is_obsolete_sp(vcpu->kvm, sp))
+ return true;
+
+ /*
+ * Roots without an associated shadow page are considered invalid if
+ * there is a pending request to free obsolete roots. The request is
+ * only a hint that the current root _may_ be obsolete and needs to be
+ * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
+ * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
+ * to reload even if no vCPU is actively using the root.
+ */
+ if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu))
+ return true;
+
+ return fault->slot &&
+ mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+}
+
static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
@@ -4013,8 +4029,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
else
write_lock(&vcpu->kvm->mmu_lock);
- if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva))
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
+
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
@@ -4355,22 +4372,28 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
- u64 pa_bits_rsvd, bool execonly)
+ u64 pa_bits_rsvd, bool execonly, int huge_page_level)
{
u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+ u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
u64 bad_mt_xwr;
+ if (huge_page_level < PG_LEVEL_1G)
+ large_1g_rsvd = rsvd_bits(7, 7);
+ if (huge_page_level < PG_LEVEL_2M)
+ large_2m_rsvd = rsvd_bits(7, 7);
+
rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
/* large page */
rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29);
- rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20);
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
@@ -4386,10 +4409,11 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
}
static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context, bool execonly)
+ struct kvm_mmu *context, bool execonly, int huge_page_level)
{
__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
- vcpu->arch.reserved_gpa_bits, execonly);
+ vcpu->arch.reserved_gpa_bits, execonly,
+ huge_page_level);
}
static inline u64 reserved_hpa_bits(void)
@@ -4465,7 +4489,8 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
false, true);
else
__reset_rsvds_bits_mask_ept(shadow_zero_check,
- reserved_hpa_bits(), false);
+ reserved_hpa_bits(), false,
+ max_huge_page_level);
if (!shadow_me_mask)
return;
@@ -4485,7 +4510,8 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
- reserved_hpa_bits(), execonly);
+ reserved_hpa_bits(), execonly,
+ max_huge_page_level);
}
#define BYTE_MASK(access) \
@@ -4734,7 +4760,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
role.base.ad_disabled = (shadow_accessed_mask == 0);
role.base.level = kvm_mmu_get_tdp_level(vcpu);
role.base.direct = true;
- role.base.gpte_is_8_bytes = true;
+ role.base.has_4_byte_gpte = false;
return role;
}
@@ -4779,7 +4805,7 @@ kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
- role.base.gpte_is_8_bytes = ____is_cr0_pg(regs) && ____is_cr4_pae(regs);
+ role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs);
return role;
}
@@ -4855,7 +4881,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
struct kvm_mmu_role_regs regs = {
.cr0 = cr0,
- .cr4 = cr4,
+ .cr4 = cr4 & ~X86_CR4_PKE,
.efer = efer,
};
union kvm_mmu_role new_role;
@@ -4878,7 +4904,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
role.base.level = level;
- role.base.gpte_is_8_bytes = true;
+ role.base.has_4_byte_gpte = false;
role.base.direct = false;
role.base.ad_disabled = !accessed_dirty;
role.base.guest_mode = true;
@@ -4893,7 +4919,8 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
}
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
- bool accessed_dirty, gpa_t new_eptp)
+ int huge_page_level, bool accessed_dirty,
+ gpa_t new_eptp)
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
u8 level = vmx_eptp_page_walk_level(new_eptp);
@@ -4919,8 +4946,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->direct_map = false;
update_permission_bitmask(context, true);
- update_pkru_bitmask(context);
- reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+ context->pkru_mask = 0;
+ reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
@@ -4984,13 +5011,13 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
* the gva_to_gpa functions between mmu and nested_mmu are swapped.
*/
if (!is_paging(vcpu))
- g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
+ g_context->gva_to_gpa = nonpaging_gva_to_gpa;
else if (is_long_mode(vcpu))
- g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
else if (is_pae(vcpu))
- g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
+ g_context->gva_to_gpa = paging64_gva_to_gpa;
else
- g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
+ g_context->gva_to_gpa = paging32_gva_to_gpa;
reset_guest_paging_metadata(vcpu, g_context);
}
@@ -5025,6 +5052,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
/*
* Invalidate all MMU roles to force them to reinitialize as CPUID
* information is factored into reserved bit calculations.
+ *
+ * Correctly handling multiple vCPU models with respect to paging and
+ * physical address properties) in a single VM would require tracking
+ * all relevant CPUID information in kvm_mmu_page_role. That is very
+ * undesirable as it would increase the memory requirements for
+ * gfn_track (see struct kvm_mmu_page_role comments). For now that
+ * problem is swept under the rug; KVM's CPUID API is horrific and
+ * it's all but impossible to solve it without introducing a new API.
*/
vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
@@ -5032,24 +5067,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
/*
- * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
- * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
- * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
- * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
- * sweep the problem under the rug.
- *
- * KVM's horrific CPUID ABI makes the problem all but impossible to
- * solve, as correctly handling multiple vCPU models (with respect to
- * paging and physical address properties) in a single VM would require
- * tracking all relevant CPUID information in kvm_mmu_page_role. That
- * is very undesirable as it would double the memory requirements for
- * gfn_track (see struct kvm_mmu_page_role comments), and in practice
- * no sane VMM mucks with the core vCPU model on the fly.
+ * Changing guest CPUID after KVM_RUN is forbidden, see the comment in
+ * kvm_arch_vcpu_ioctl().
*/
- if (vcpu->arch.last_vmentry_cpu != -1) {
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
- pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
- }
+ KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -5161,7 +5182,7 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
gpa, bytes, sp->role.word);
offset = offset_in_page(gpa);
- pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
+ pte_size = sp->role.has_4_byte_gpte ? 4 : 8;
/*
* Sometimes, the OS only writes the last one bytes to update status
@@ -5185,7 +5206,7 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
page_offset = offset_in_page(gpa);
level = sp->role.level;
*nspte = 1;
- if (!sp->role.gpte_is_8_bytes) {
+ if (sp->role.has_4_byte_gpte) {
page_offset <<= 1; /* 32->64 */
/*
* A 32-bit pde maps 4MB while the shadow pdes map
@@ -5369,7 +5390,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
+ kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5497,10 +5518,13 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
mmu->root_hpa = INVALID_PAGE;
mmu->root_pgd = 0;
- mmu->translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+ /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */
+ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
+ return 0;
+
/*
* When using PAE paging, the four PDPTEs are treated as 'root' pages,
* while the PDP table is a per-vCPU construct that's allocated at MMU
@@ -5510,7 +5534,7 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
* generally doesn't use PAE paging and can skip allocating the PDP
* table. The main exception, handled here, is SVM's 32-bit NPT. The
* other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
- * KVM; that horror is handled on-demand by mmu_alloc_shadow_roots().
+ * KVM; that horror is handled on-demand by mmu_alloc_special_roots().
*/
if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
return 0;
@@ -5555,8 +5579,6 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
- vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
-
ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
if (ret)
return ret;
@@ -5715,6 +5737,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
const struct kvm_memory_slot *memslot;
struct kvm_memslots *slots;
+ struct kvm_memslot_iter iter;
bool flush = false;
gfn_t start, end;
int i;
@@ -5724,13 +5747,16 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots) {
+
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
+ memslot = iter.slot;
start = max(gfn_start, memslot->base_gfn);
end = min(gfn_end, memslot->base_gfn + memslot->npages);
- if (start >= end)
+ if (WARN_ON_ONCE(start >= end))
continue;
flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+
PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
start, end - 1, true, flush);
}
@@ -5748,6 +5774,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
bool flush;
int i;
+ if (WARN_ON_ONCE(gfn_end <= gfn_start))
+ return;
+
write_lock(&kvm->mmu_lock);
kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
@@ -5797,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}
/*
- * We can flush all the TLBs out of the mmu lock without TLB
- * corruption since we just change the spte from writable to
- * readonly so that we only need to care the case of changing
- * spte from present to present (changing the spte from present
- * to nonpresent will flush all the TLBs immediately), in other
- * words, the only case we care is mmu_spte_update() where we
- * have checked Host-writable | MMU-writable instead of
- * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
- * anymore.
+ * Flush TLBs if any SPTEs had to be write-protected to ensure that
+ * guest writes are reflected in the dirty bitmap before the memslot
+ * update completes, i.e. before enabling dirty logging is visible to
+ * userspace.
+ *
+ * Perform the TLB flush outside the mmu_lock to reduce the amount of
+ * time the lock is held. However, this does mean that another CPU can
+ * now grab mmu_lock and encounter a write-protected SPTE while CPUs
+ * still have a writable mapping for the associated GFN in their TLB.
+ *
+ * This is safe but requires KVM to be careful when making decisions
+ * based on the write-protection status of an SPTE. Specifically, KVM
+ * also write-protects SPTEs to monitor changes to guest page tables
+ * during shadow paging, and must guarantee no CPUs can write to those
+ * page before the lock is dropped. As mentioned in the previous
+ * paragraph, a write-protected SPTE is no guarantee that CPU cannot
+ * perform writes. So to determine if a TLB flush is truly required, KVM
+ * will clear a separate software-only bit (MMU-writable) and skip the
+ * flush if-and-only-if this bit was already clear.
+ *
+ * See DEFAULT_SPTE_MMU_WRITEABLE for more details.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
@@ -5854,8 +5895,6 @@ restart:
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
- bool flush = false;
-
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
/*
@@ -5863,17 +5902,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
* logging at a 4k granularity and never creates collapsible
* 2m SPTEs during dirty logging.
*/
- flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
- if (flush)
+ if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
}
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
- flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
- if (flush)
- kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+ kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
@@ -6142,30 +6178,6 @@ out:
return ret;
}
-/*
- * Calculate mmu pages needed for kvm.
- */
-unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
-{
- unsigned long nr_mmu_pages;
- unsigned long nr_pages = 0;
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
- int i;
-
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
-
- kvm_for_each_memslot(memslot, slots)
- nr_pages += memslot->npages;
- }
-
- nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
-
- return nr_mmu_pages;
-}
-
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
@@ -6182,23 +6194,46 @@ void kvm_mmu_module_exit(void)
mmu_audit_disable();
}
+/*
+ * Calculate the effective recovery period, accounting for '0' meaning "let KVM
+ * select a halving time of 1 hour". Returns true if recovery is enabled.
+ */
+static bool calc_nx_huge_pages_recovery_period(uint *period)
+{
+ /*
+ * Use READ_ONCE to get the params, this may be called outside of the
+ * param setters, e.g. by the kthread to compute its next timeout.
+ */
+ bool enabled = READ_ONCE(nx_huge_pages);
+ uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+
+ if (!enabled || !ratio)
+ return false;
+
+ *period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ if (!*period) {
+ /* Make sure the period is not less than one second. */
+ ratio = min(ratio, 3600u);
+ *period = 60 * 60 * 1000 / ratio;
+ }
+ return true;
+}
+
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
{
bool was_recovery_enabled, is_recovery_enabled;
uint old_period, new_period;
int err;
- was_recovery_enabled = nx_huge_pages_recovery_ratio;
- old_period = nx_huge_pages_recovery_period_ms;
+ was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
err = param_set_uint(val, kp);
if (err)
return err;
- is_recovery_enabled = nx_huge_pages_recovery_ratio;
- new_period = nx_huge_pages_recovery_period_ms;
+ is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
- if (READ_ONCE(nx_huge_pages) && is_recovery_enabled &&
+ if (is_recovery_enabled &&
(!was_recovery_enabled || old_period > new_period)) {
struct kvm *kvm;
@@ -6262,18 +6297,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
static long get_nx_lpage_recovery_timeout(u64 start_time)
{
- uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
- uint period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+ bool enabled;
+ uint period;
- if (!period && ratio) {
- /* Make sure the period is not less than one second. */
- ratio = min(ratio, 3600u);
- period = 60 * 60 * 1000 / ratio;
- }
+ enabled = calc_nx_huge_pages_recovery_period(&period);
- return READ_ONCE(nx_huge_pages) && ratio
- ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
- : MAX_SCHEDULE_TIMEOUT;
+ return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
+ : MAX_SCHEDULE_TIMEOUT;
}
static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 52c6527b1a06..da6166b5c377 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -104,7 +104,7 @@ static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
return kvm_mmu_role_as_id(sp->role);
}
-static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
+static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
{
/*
* When using the EPT page-modification log, the GPAs in the CPU dirty
@@ -112,13 +112,12 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
* on write protection to record dirty pages, which bypasses PML, since
* writes now result in a vmexit. Note, the check on CPU dirty logging
* being enabled is mandatory as the bits used to denote WP-only SPTEs
- * are reserved for NPT w/ PAE (32-bit KVM).
+ * are reserved for PAE paging (32-bit KVM).
*/
- return vcpu->arch.mmu == &vcpu->arch.guest_mmu &&
- kvm_x86_ops.cpu_dirty_log_size;
+ return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
}
-int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
gfn_t gfn, bool can_unsync, bool prefetch);
void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index b8151bbca36a..de5e8e4e1aa7 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -35,7 +35,7 @@
" %snxe %sad root %u %s%c", \
__entry->mmu_valid_gen, \
__entry->gfn, role.level, \
- role.gpte_is_8_bytes ? 8 : 4, \
+ role.has_4_byte_gpte ? 4 : 8, \
role.quadrant, \
role.direct ? " direct" : "", \
access_str[role.access], \
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index cc4eb5b7fb76..68eb1fb548b6 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -173,9 +173,9 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page);
/*
* check if the corresponding access on the specified guest page is tracked.
*/
-bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu,
- struct kvm_memory_slot *slot, gfn_t gfn,
- enum kvm_page_track_mode mode)
+bool kvm_slot_page_track_is_active(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t gfn, enum kvm_page_track_mode mode)
{
int index;
@@ -186,7 +186,7 @@ bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu,
return false;
if (mode == KVM_PAGE_TRACK_WRITE &&
- !kvm_page_track_write_tracking_enabled(vcpu->kvm))
+ !kvm_page_track_write_tracking_enabled(kvm))
return false;
index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index f87d36898c44..5b5bdac97c7b 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -403,9 +403,8 @@ retry_walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
- nested_access,
- &walker->fault);
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
+ nested_access, &walker->fault);
/*
* FIXME: This can happen if emulation (for of an INS/OUTS
@@ -467,7 +466,7 @@ retry_walk:
if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36())
gfn += pse36_gfn_delta(pte);
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
if (real_gpa == UNMAPPED_GVA)
return 0;
@@ -547,16 +546,6 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
access);
}
-#if PTTYPE != PTTYPE_EPT
-static int FNAME(walk_addr_nested)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr,
- u32 access)
-{
- return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
- addr, access);
-}
-#endif
-
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, pt_element_t gpte, bool no_dirty_log)
@@ -911,7 +900,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
- if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva))
+
+ if (is_page_fault_stale(vcpu, fault, mmu_seq))
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
@@ -999,50 +989,29 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
}
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ gpa_t addr, u32 access,
struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, addr, access);
-
- if (r) {
- gpa = gfn_to_gpa(walker.gfn);
- gpa |= addr & ~PAGE_MASK;
- } else if (exception)
- *exception = walker.fault;
-
- return gpa;
-}
-
-#if PTTYPE != PTTYPE_EPT
-/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
- u32 access,
- struct x86_exception *exception)
-{
- struct guest_walker walker;
- gpa_t gpa = UNMAPPED_GVA;
- int r;
-
#ifndef CONFIG_X86_64
/* A 64-bit GVA should be impossible on 32-bit KVM. */
- WARN_ON_ONCE(vaddr >> 32);
+ WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu);
#endif
- r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
+ r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access);
if (r) {
gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
+ gpa |= addr & ~PAGE_MASK;
} else if (exception)
*exception = walker.fault;
return gpa;
}
-#endif
/*
* Using the cached information from sp->gfns is safe because:
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 0c76c45fdb68..73cfe62fdad1 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -16,6 +16,7 @@
#include "spte.h"
#include <asm/e820/api.h>
+#include <asm/memtype.h>
#include <asm/vmx.h>
static bool __read_mostly enable_mmio_caching = true;
@@ -90,7 +91,7 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
}
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct kvm_memory_slot *slot,
+ const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
u64 old_spte, bool prefetch, bool can_unsync,
bool host_writable, u64 *new_spte)
@@ -101,7 +102,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED_MASK;
- else if (kvm_vcpu_ad_need_write_protect(vcpu))
+ else if (kvm_mmu_page_ad_need_write_protect(sp))
spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
/*
@@ -161,7 +162,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* e.g. it's write-tracked (upper-level SPs) or has one or more
* shadow pages and unsync'ing pages is not allowed.
*/
- if (mmu_try_to_unsync_pages(vcpu, slot, gfn, can_unsync, prefetch)) {
+ if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
wrprot = true;
@@ -215,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~shadow_host_writable_mask;
+ new_spte &= ~shadow_mmu_writable_mask;
new_spte = mark_spte_for_access_track(new_spte);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index cc432f9a966b..be6a007a4af3 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
-#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
-#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
-
/*
* The mask/shift to use for saving the original R/X bits when marking the PTE
* as not-present for access tracking purposes. We do not save the W bit as the
@@ -79,6 +75,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
/*
+ * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
+ * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
+ * that map guest pages in read-only memslots and read-only VMAs.
+ *
+ * Invariants:
+ * - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
+ *
+ *
+ * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
+ * allows writes to the guest page mapped by the SPTE. This bit is cleared when
+ * the guest page mapped by the SPTE contains a page table that is being
+ * monitored for shadow paging. In this case the SPTE can only be made writable
+ * by unsyncing the shadow page under the mmu_lock.
+ *
+ * Invariants:
+ * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
+ * - If MMU-writable is set, Host-writable must be set.
+ *
+ * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
+ * to track writes for dirty logging. For such SPTEs, KVM will locklessly set
+ * PT_WRITABLE_MASK upon the next write from the guest and record the write in
+ * the dirty log (see fast_page_fault()).
+ */
+
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
+
+/*
* Low ignored bits are at a premium for EPT, use high ignored bits, taking care
* to not overlap the A/D type mask or the saved access bits of access-tracked
* SPTEs when A/D bits are disabled.
@@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{
- return (spte & shadow_host_writable_mask) &&
- (spte & shadow_mmu_writable_mask);
+ if (spte & shadow_mmu_writable_mask) {
+ WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
+ return true;
+ }
+
+ WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
+ return false;
}
static inline u64 get_mmio_spte_generation(u64 spte)
@@ -330,7 +360,7 @@ static inline u64 get_mmio_spte_generation(u64 spte)
}
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- struct kvm_memory_slot *slot,
+ const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
u64 old_spte, bool prefetch, bool can_unsync,
bool host_writable, u64 *new_spte);
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index b3ed302c1a35..caa96c270b95 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -26,6 +26,7 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
*/
void tdp_iter_restart(struct tdp_iter *iter)
{
+ iter->yielded = false;
iter->yielded_gfn = iter->next_last_level_gfn;
iter->level = iter->root_level;
@@ -160,6 +161,11 @@ static bool try_step_up(struct tdp_iter *iter)
*/
void tdp_iter_next(struct tdp_iter *iter)
{
+ if (iter->yielded) {
+ tdp_iter_restart(iter);
+ return;
+ }
+
if (try_step_down(iter))
return;
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index b1748b988d3a..e19cabbcb65c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -45,6 +45,12 @@ struct tdp_iter {
* iterator walks off the end of the paging structure.
*/
bool valid;
+ /*
+ * True if KVM dropped mmu_lock and yielded in the middle of a walk, in
+ * which case tdp_iter_next() needs to restart the walk at the root
+ * level instead of advancing to the next entry.
+ */
+ bool yielded;
};
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index a54c3491af42..bc9e3553fba2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -165,7 +165,7 @@ static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
role = vcpu->arch.mmu->mmu_role.base;
role.level = level;
role.direct = true;
- role.gpte_is_8_bytes = true;
+ role.has_4_byte_gpte = false;
role.access = ACC_ALL;
role.ad_disabled = !shadow_accessed_mask;
@@ -317,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
- u64 old_child_spte;
- u64 *sptep;
- gfn_t gfn;
int i;
trace_kvm_mmu_prepare_zap_page(sp);
@@ -327,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
tdp_mmu_unlink_page(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- sptep = rcu_dereference(pt) + i;
- gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 *sptep = rcu_dereference(pt) + i;
+ gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
+ u64 old_child_spte;
if (shared) {
/*
@@ -374,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
shared);
}
- kvm_flush_remote_tlbs_with_address(kvm, gfn,
+ kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
KVM_PAGES_PER_HPAGE(level + 1));
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -504,6 +502,8 @@ static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter,
u64 new_spte)
{
+ WARN_ON_ONCE(iter->yielded);
+
lockdep_assert_held_read(&kvm->mmu_lock);
/*
@@ -577,6 +577,8 @@ static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte, bool record_acc_track,
bool record_dirty_log)
{
+ WARN_ON_ONCE(iter->yielded);
+
lockdep_assert_held_write(&kvm->mmu_lock);
/*
@@ -642,18 +644,19 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
* If this function should yield and flush is set, it will perform a remote
* TLB flush before yielding.
*
- * If this function yields, it will also reset the tdp_iter's walk over the
- * paging structure and the calling function should skip to the next
- * iteration to allow the iterator to continue its traversal from the
- * paging structure root.
+ * If this function yields, iter->yielded is set and the caller must skip to
+ * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
+ * over the paging structures to allow the iterator to continue its traversal
+ * from the paging structure root.
*
- * Return true if this function yielded and the iterator's traversal was reset.
- * Return false if a yield was not needed.
+ * Returns true if this function yielded.
*/
-static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
- struct tdp_iter *iter, bool flush,
- bool shared)
+static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter,
+ bool flush, bool shared)
{
+ WARN_ON(iter->yielded);
+
/* Ensure forward progress has been made before yielding. */
if (iter->next_last_level_gfn == iter->yielded_gfn)
return false;
@@ -673,12 +676,10 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
WARN_ON(iter->gfn > iter->next_last_level_gfn);
- tdp_iter_restart(iter);
-
- return true;
+ iter->yielded = true;
}
- return false;
+ return iter->yielded;
}
/*
@@ -1033,9 +1034,9 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
- flush |= zap_gfn_range(kvm, root, range->start, range->end,
- range->may_block, flush, false);
+ for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
+ flush = zap_gfn_range(kvm, root, range->start, range->end,
+ range->may_block, flush, false);
return flush;
}
@@ -1364,10 +1365,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
-static bool zap_collapsible_spte_range(struct kvm *kvm,
+static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
- const struct kvm_memory_slot *slot,
- bool flush)
+ const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
@@ -1378,10 +1378,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
tdp_root_for_each_pte(iter, root, start, end) {
retry:
- if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
- flush = false;
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
- }
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
@@ -1393,6 +1391,7 @@ retry:
pfn, PG_LEVEL_NUM))
continue;
+ /* Note, a successful atomic zap also does a remote TLB flush. */
if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
/*
* The iter must explicitly re-read the SPTE because
@@ -1401,30 +1400,24 @@ retry:
iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
goto retry;
}
- flush = true;
}
rcu_read_unlock();
-
- return flush;
}
/*
* Clear non-leaf entries (and free associated page tables) which could
* be replaced by large mappings, for GFNs within the slot.
*/
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- bool flush)
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
- flush = zap_collapsible_spte_range(kvm, root, slot, flush);
-
- return flush;
+ zap_collapsible_spte_range(kvm, root, slot);
}
/*
@@ -1449,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- if (!is_writable_pte(iter.old_spte))
- break;
-
new_spte = iter.old_spte &
~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
+ if (new_spte == iter.old_spte)
+ break;
+
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 476b133544dd..3899004a5d91 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -64,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
-bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- bool flush);
+void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 09873f6488f7..f614f95acc6b 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -13,6 +13,8 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
@@ -55,43 +57,41 @@ static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
kvm_pmu_deliver_pmi(vcpu);
}
-static void kvm_perf_overflow(struct perf_event *perf_event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
- struct kvm_pmc *pmc = perf_event->overflow_handler_context;
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
- __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
- kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
- }
+ /* Ignore counters that have been reprogrammed already. */
+ if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
+ return;
+
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+
+ if (!pmc->intr)
+ return;
+
+ /*
+ * Inject PMI. If vcpu was in a guest mode during NMI PMI
+ * can be ejected on a guest mode re-entry. Otherwise we can't
+ * be sure that vcpu wasn't executing hlt instruction at the
+ * time of vmexit and is not going to re-enter guest mode until
+ * woken up. So we should wake it, but this is impossible from
+ * NMI context. Do it from irq work instead.
+ */
+ if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu))
+ irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
+ else
+ kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}
-static void kvm_perf_overflow_intr(struct perf_event *perf_event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static void kvm_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
- struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) {
- __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
- kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
-
- /*
- * Inject PMI. If vcpu was in a guest mode during NMI PMI
- * can be ejected on a guest mode re-entry. Otherwise we can't
- * be sure that vcpu wasn't executing hlt instruction at the
- * time of vmexit and is not going to re-enter guest mode until
- * woken up. So we should wake it, but this is impossible from
- * NMI context. Do it from irq work instead.
- */
- if (!kvm_is_in_guest())
- irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
- else
- kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
- }
+ __kvm_perf_overflow(pmc, true);
}
static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
@@ -111,6 +111,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
.config = config,
};
+ if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
+ return;
+
attr.sample_period = get_sample_period(pmc, pmc->counter);
if (in_tx)
@@ -126,7 +129,6 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
}
event = perf_event_create_kernel_counter(&attr, -1, current,
- intr ? kvm_perf_overflow_intr :
kvm_perf_overflow, pmc);
if (IS_ERR(event)) {
pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
@@ -138,6 +140,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
pmc_to_pmu(pmc)->event_count++;
clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
pmc->is_paused = false;
+ pmc->intr = intr;
}
static void pmc_pause_counter(struct kvm_pmc *pmc)
@@ -171,13 +174,16 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return true;
}
+static int cmp_u64(const void *a, const void *b)
+{
+ return *(__u64 *)a - *(__u64 *)b;
+}
+
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
unsigned config, type = PERF_TYPE_RAW;
- u8 event_select, unit_mask;
struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter;
- int i;
bool allow_event = true;
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@@ -192,31 +198,23 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (filter) {
- for (i = 0; i < filter->nevents; i++)
- if (filter->events[i] ==
- (eventsel & AMD64_RAW_EVENT_MASK_NB))
- break;
- if (filter->action == KVM_PMU_EVENT_ALLOW &&
- i == filter->nevents)
- allow_event = false;
- if (filter->action == KVM_PMU_EVENT_DENY &&
- i < filter->nevents)
- allow_event = false;
+ __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
+
+ if (bsearch(&key, filter->events, filter->nevents,
+ sizeof(__u64), cmp_u64))
+ allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
+ else
+ allow_event = filter->action == KVM_PMU_EVENT_DENY;
}
if (!allow_event)
return;
- event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
- unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
-
if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
ARCH_PERFMON_EVENTSEL_INV |
ARCH_PERFMON_EVENTSEL_CMASK |
HSW_IN_TX |
HSW_IN_TX_CHECKPOINTED))) {
- config = kvm_x86_ops.pmu_ops->find_arch_event(pmc_to_pmu(pmc),
- event_select,
- unit_mask);
+ config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
if (config != PERF_COUNT_HW_MAX)
type = PERF_TYPE_HARDWARE;
}
@@ -268,7 +266,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
pmc->current_config = (u64)ctrl;
pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
- kvm_x86_ops.pmu_ops->find_fixed_event(idx),
+ kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc),
!(en_field & 0x2), /* exclude user */
!(en_field & 0x1), /* exclude kernel */
pmi, false, false);
@@ -490,6 +488,66 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
kvm_pmu_reset(vcpu);
}
+static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u64 prev_count;
+
+ prev_count = pmc->counter;
+ pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
+
+ reprogram_counter(pmu, pmc->idx);
+ if (pmc->counter < prev_count)
+ __kvm_perf_overflow(pmc, false);
+}
+
+static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
+ unsigned int perf_hw_id)
+{
+ u64 old_eventsel = pmc->eventsel;
+ unsigned int config;
+
+ pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
+ config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
+ pmc->eventsel = old_eventsel;
+ return config == perf_hw_id;
+}
+
+static inline bool cpl_is_matched(struct kvm_pmc *pmc)
+{
+ bool select_os, select_user;
+ u64 config = pmc->current_config;
+
+ if (pmc_is_gp(pmc)) {
+ select_os = config & ARCH_PERFMON_EVENTSEL_OS;
+ select_user = config & ARCH_PERFMON_EVENTSEL_USR;
+ } else {
+ select_os = config & 0x1;
+ select_user = config & 0x2;
+ }
+
+ return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
+}
+
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i;
+
+ for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
+ pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);
+
+ if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
+ continue;
+
+ /* Ignore checks for edge detect, pin control, invert and CMASK bits */
+ if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
+ kvm_pmu_incr_counter(pmc);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
+
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
struct kvm_pmu_event_filter tmp, *filter;
@@ -521,6 +579,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
/* Ensure nevents can't be changed between the user copies. */
*filter = tmp;
+ /*
+ * Sort the in-kernel list so that we can search it with bsearch.
+ */
+ sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+
mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 59d6b76203d5..7a7b8d5b775e 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -24,9 +24,7 @@ struct kvm_event_hw_type_mapping {
};
struct kvm_pmu_ops {
- unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select,
- u8 unit_mask);
- unsigned (*find_fixed_event)(int idx);
+ unsigned int (*pmc_perf_hw_id)(struct kvm_pmc *pmc);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
@@ -159,6 +157,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id);
bool is_vmware_backdoor_pmc(u32 pmc_idx);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index affc0ea98d30..90364d02f22a 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -293,15 +293,18 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
+ /*
+ * Wake any target vCPUs that are blocking, i.e. waiting for a wake
+ * event. There's no need to signal doorbells, as hardware has handled
+ * vCPUs that were in guest at the time of the IPI, and vCPUs that have
+ * since entered the guest will have processed pending IRQs at VMRUN.
+ */
kvm_for_each_vcpu(i, vcpu, kvm) {
- bool m = kvm_apic_match_dest(vcpu, source,
- icrl & APIC_SHORT_MASK,
- GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK);
-
- if (m && !avic_vcpu_is_running(vcpu))
+ if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & APIC_DEST_MASK))
kvm_vcpu_wake_up(vcpu);
}
}
@@ -672,16 +675,42 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
return -1;
kvm_lapic_set_irr(vec, vcpu->arch.apic);
+
+ /*
+ * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
+ * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
+ * the read of guest_mode, which guarantees that either VMRUN will see
+ * and process the new vIRR entry, or that the below code will signal
+ * the doorbell if the vCPU is already running in the guest.
+ */
smp_mb__after_atomic();
- if (avic_vcpu_is_running(vcpu)) {
- int cpuid = vcpu->cpu;
+ /*
+ * Signal the doorbell to tell hardware to inject the IRQ if the vCPU
+ * is in the guest. If the vCPU is not in the guest, hardware will
+ * automatically process AVIC interrupts at VMRUN.
+ */
+ if (vcpu->mode == IN_GUEST_MODE) {
+ int cpu = READ_ONCE(vcpu->cpu);
- if (cpuid != get_cpu())
- wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
+ /*
+ * Note, the vCPU could get migrated to a different pCPU at any
+ * point, which could result in signalling the wrong/previous
+ * pCPU. But if that happens the vCPU is guaranteed to do a
+ * VMRUN (after being migrated) and thus will process pending
+ * interrupts, i.e. a doorbell is not needed (and the spurious
+ * one is harmless).
+ */
+ if (cpu != get_cpu())
+ wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
put_cpu();
- } else
+ } else {
+ /*
+ * Wake the vCPU if it was blocking. KVM will then detect the
+ * pending IRQ when checking if the vCPU has a wake event.
+ */
kvm_vcpu_wake_up(vcpu);
+ }
return 0;
}
@@ -900,6 +929,7 @@ out:
bool svm_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_NESTED) |
BIT(APICV_INHIBIT_REASON_IRQWIN) |
@@ -948,6 +978,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
+ lockdep_assert_preemption_disabled();
+
/*
* Since the host physical APIC id is 8 bits,
* we can support host APIC ID upto 255.
@@ -955,19 +987,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
entry = READ_ONCE(*(svm->avic_physical_id_cache));
WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
-
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- if (svm->avic_is_running)
- entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
- avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
- svm->avic_is_running);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -975,40 +1013,56 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
+ lockdep_assert_preemption_disabled();
+
entry = READ_ONCE(*(svm->avic_physical_id_cache));
- if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
- avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+ /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+ return;
+
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
-/*
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->avic_is_running = is_run;
-
if (!kvm_vcpu_apicv_active(vcpu))
return;
- if (is_run)
- avic_vcpu_load(vcpu, vcpu->cpu);
- else
- avic_vcpu_put(vcpu);
+ preempt_disable();
+
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_
+ * the vCPU actually blocks.
+ *
+ * Any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source, therefore vIRR will also
+ * be checked by kvm_vcpu_check_block() before blocking. The
+ * memory barrier implicit in set_current_state orders writing
+ * IsRunning=0 before reading the vIRR. The processor needs a
+ * matching memory barrier on interrupt delivery between writing
+ * IRR and reading IsRunning; the lack of this barrier might be
+ * the cause of errata #1235).
+ */
+ avic_vcpu_put(vcpu);
+
+ preempt_enable();
}
-void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- avic_set_running(vcpu, false);
-}
+ int cpu;
-void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
-{
- if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
- kvm_vcpu_update_apicv(vcpu);
- avic_set_running(vcpu, true);
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ cpu = get_cpu();
+ WARN_ON(cpu != vcpu->cpu);
+
+ avic_vcpu_load(vcpu, cpu);
+
+ put_cpu();
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index f8b7bc04b3e7..1218b5a342fc 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -58,8 +58,9 @@ static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_excep
struct vcpu_svm *svm = to_svm(vcpu);
WARN_ON(!is_guest_mode(vcpu));
- if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
- !svm->nested.nested_run_pending) {
+ if (vmcb12_is_intercept(&svm->nested.ctl,
+ INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
+ !svm->nested.nested_run_pending) {
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
svm->vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = fault->error_code;
@@ -121,7 +122,8 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
void recalc_intercepts(struct vcpu_svm *svm)
{
- struct vmcb_control_area *c, *h, *g;
+ struct vmcb_control_area *c, *h;
+ struct vmcb_ctrl_area_cached *g;
unsigned int i;
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
@@ -163,37 +165,6 @@ void recalc_intercepts(struct vcpu_svm *svm)
vmcb_set_intercept(c, INTERCEPT_VMSAVE);
}
-static void copy_vmcb_control_area(struct vmcb_control_area *dst,
- struct vmcb_control_area *from)
-{
- unsigned int i;
-
- for (i = 0; i < MAX_INTERCEPT; i++)
- dst->intercepts[i] = from->intercepts[i];
-
- dst->iopm_base_pa = from->iopm_base_pa;
- dst->msrpm_base_pa = from->msrpm_base_pa;
- dst->tsc_offset = from->tsc_offset;
- /* asid not copied, it is handled manually for svm->vmcb. */
- dst->tlb_ctl = from->tlb_ctl;
- dst->int_ctl = from->int_ctl;
- dst->int_vector = from->int_vector;
- dst->int_state = from->int_state;
- dst->exit_code = from->exit_code;
- dst->exit_code_hi = from->exit_code_hi;
- dst->exit_info_1 = from->exit_info_1;
- dst->exit_info_2 = from->exit_info_2;
- dst->exit_int_info = from->exit_int_info;
- dst->exit_int_info_err = from->exit_int_info_err;
- dst->nested_ctl = from->nested_ctl;
- dst->event_inj = from->event_inj;
- dst->event_inj_err = from->event_inj_err;
- dst->nested_cr3 = from->nested_cr3;
- dst->virt_ext = from->virt_ext;
- dst->pause_filter_count = from->pause_filter_count;
- dst->pause_filter_thresh = from->pause_filter_thresh;
-}
-
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
/*
@@ -203,7 +174,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
*/
int i;
- if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return true;
for (i = 0; i < MSRPM_OFFSETS; i++) {
@@ -250,10 +221,10 @@ static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl)
}
}
-static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
- struct vmcb_control_area *control)
+static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
+ struct vmcb_ctrl_area_cached *control)
{
- if (CC(!vmcb_is_intercept(control, INTERCEPT_VMRUN)))
+ if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
return false;
if (CC(control->asid == 0))
@@ -275,9 +246,20 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
return true;
}
-static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
- struct vmcb_save_area *save)
+/* Common checks that apply to both L1 and L2 state. */
+static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
+ struct vmcb_save_area_cached *save)
{
+ if (CC(!(save->efer & EFER_SVME)))
+ return false;
+
+ if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
+ CC(save->cr0 & ~0xffffffffULL))
+ return false;
+
+ if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
+ return false;
+
/*
* These checks are also performed by KVM_SET_SREGS,
* except that EFER.LMA is not checked by SVM against
@@ -293,48 +275,90 @@ static bool nested_vmcb_check_cr3_cr4(struct kvm_vcpu *vcpu,
if (CC(!kvm_is_valid_cr4(vcpu, save->cr4)))
return false;
+ if (CC(!kvm_valid_efer(vcpu, save->efer)))
+ return false;
+
return true;
}
-/* Common checks that apply to both L1 and L2 state. */
-static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu,
- struct vmcb_save_area *save)
+static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
{
- /*
- * FIXME: these should be done after copying the fields,
- * to avoid TOC/TOU races. For these save area checks
- * the possible damage is limited since kvm_set_cr0 and
- * kvm_set_cr4 handle failure; EFER_SVME is an exception
- * so it is force-set later in nested_prepare_vmcb_save.
- */
- if (CC(!(save->efer & EFER_SVME)))
- return false;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area_cached *save = &svm->nested.save;
- if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
- CC(save->cr0 & ~0xffffffffULL))
- return false;
+ return __nested_vmcb_check_save(vcpu, save);
+}
- if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
- return false;
+static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
- if (!nested_vmcb_check_cr3_cr4(vcpu, save))
- return false;
+ return __nested_vmcb_check_controls(vcpu, ctl);
+}
- if (CC(!kvm_valid_efer(vcpu, save->efer)))
- return false;
+static
+void __nested_copy_vmcb_control_to_cache(struct vmcb_ctrl_area_cached *to,
+ struct vmcb_control_area *from)
+{
+ unsigned int i;
- return true;
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ to->intercepts[i] = from->intercepts[i];
+
+ to->iopm_base_pa = from->iopm_base_pa;
+ to->msrpm_base_pa = from->msrpm_base_pa;
+ to->tsc_offset = from->tsc_offset;
+ to->tlb_ctl = from->tlb_ctl;
+ to->int_ctl = from->int_ctl;
+ to->int_vector = from->int_vector;
+ to->int_state = from->int_state;
+ to->exit_code = from->exit_code;
+ to->exit_code_hi = from->exit_code_hi;
+ to->exit_info_1 = from->exit_info_1;
+ to->exit_info_2 = from->exit_info_2;
+ to->exit_int_info = from->exit_int_info;
+ to->exit_int_info_err = from->exit_int_info_err;
+ to->nested_ctl = from->nested_ctl;
+ to->event_inj = from->event_inj;
+ to->event_inj_err = from->event_inj_err;
+ to->nested_cr3 = from->nested_cr3;
+ to->virt_ext = from->virt_ext;
+ to->pause_filter_count = from->pause_filter_count;
+ to->pause_filter_thresh = from->pause_filter_thresh;
+
+ /* Copy asid here because nested_vmcb_check_controls will check it. */
+ to->asid = from->asid;
+ to->msrpm_base_pa &= ~0x0fffULL;
+ to->iopm_base_pa &= ~0x0fffULL;
}
-void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
- struct vmcb_control_area *control)
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control)
{
- copy_vmcb_control_area(&svm->nested.ctl, control);
+ __nested_copy_vmcb_control_to_cache(&svm->nested.ctl, control);
+}
- /* Copy it here because nested_svm_check_controls will check it. */
- svm->nested.ctl.asid = control->asid;
- svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL;
- svm->nested.ctl.iopm_base_pa &= ~0x0fffULL;
+static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
+ struct vmcb_save_area *from)
+{
+ /*
+ * Copy only fields that are validated, as we need them
+ * to avoid TOC/TOU races.
+ */
+ to->efer = from->efer;
+ to->cr0 = from->cr0;
+ to->cr3 = from->cr3;
+ to->cr4 = from->cr4;
+
+ to->dr6 = from->dr6;
+ to->dr7 = from->dr7;
+}
+
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save)
+{
+ __nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
}
/*
@@ -437,14 +461,13 @@ static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
return -EINVAL;
if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
- CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)))
+ CC(!load_pdptrs(vcpu, cr3)))
return -EINVAL;
if (!nested_npt)
kvm_mmu_new_pgd(vcpu, cr3);
vcpu->arch.cr3 = cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
@@ -490,15 +513,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
- /*
- * Force-set EFER_SVME even though it is checked earlier on the
- * VMCB12, because the guest can flip the bit between the check
- * and now. Clearing EFER_SVME would call svm_free_nested.
- */
- svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME);
+ svm_set_efer(&svm->vcpu, svm->nested.save.efer);
- svm_set_cr0(&svm->vcpu, vmcb12->save.cr0);
- svm_set_cr4(&svm->vcpu, vmcb12->save.cr4);
+ svm_set_cr0(&svm->vcpu, svm->nested.save.cr0);
+ svm_set_cr4(&svm->vcpu, svm->nested.save.cr4);
svm->vcpu.arch.cr2 = vmcb12->save.cr2;
@@ -513,8 +531,8 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
/* These bits will be set properly on the first execution when new_vmc12 is true */
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
- svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
+ svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
+ svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}
}
@@ -628,7 +646,7 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
nested_vmcb02_prepare_control(svm);
nested_vmcb02_prepare_save(svm, vmcb12);
- ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
+ ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
nested_npt_enabled(svm), from_vmrun);
if (ret)
return ret;
@@ -678,10 +696,11 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(!svm->nested.initialized))
return -EINVAL;
- nested_load_control_from_vmcb12(svm, &vmcb12->control);
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
- if (!nested_vmcb_valid_sregs(vcpu, &vmcb12->save) ||
- !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) {
+ if (!nested_vmcb_check_save(vcpu) ||
+ !nested_vmcb_check_controls(vcpu)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
vmcb12->control.exit_code_hi = 0;
vmcb12->control.exit_info_1 = 0;
@@ -964,9 +983,9 @@ void svm_free_nested(struct vcpu_svm *svm)
/*
* Forcibly leave nested mode in order to be able to reset the VCPU later on.
*/
-void svm_leave_nested(struct vcpu_svm *svm)
+void svm_leave_nested(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0;
@@ -988,7 +1007,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
u32 offset, msr, value;
int write, mask;
- if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
return NESTED_EXIT_HOST;
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
@@ -1015,7 +1034,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
u8 start_bit;
u64 gpa;
- if (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
+ if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
return NESTED_EXIT_HOST;
port = svm->vmcb->control.exit_info_1 >> 16;
@@ -1046,12 +1065,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
vmexit = nested_svm_intercept_ioio(svm);
break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
- if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
- if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
break;
}
@@ -1069,7 +1088,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
break;
}
default: {
- if (vmcb_is_intercept(&svm->nested.ctl, exit_code))
+ if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
vmexit = NESTED_EXIT_DONE;
}
}
@@ -1147,7 +1166,7 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
}
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
@@ -1251,11 +1270,47 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
}
+/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
+static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
+ struct vmcb_ctrl_area_cached *from)
+{
+ unsigned int i;
+
+ memset(dst, 0, sizeof(struct vmcb_control_area));
+
+ for (i = 0; i < MAX_INTERCEPT; i++)
+ dst->intercepts[i] = from->intercepts[i];
+
+ dst->iopm_base_pa = from->iopm_base_pa;
+ dst->msrpm_base_pa = from->msrpm_base_pa;
+ dst->tsc_offset = from->tsc_offset;
+ dst->asid = from->asid;
+ dst->tlb_ctl = from->tlb_ctl;
+ dst->int_ctl = from->int_ctl;
+ dst->int_vector = from->int_vector;
+ dst->int_state = from->int_state;
+ dst->exit_code = from->exit_code;
+ dst->exit_code_hi = from->exit_code_hi;
+ dst->exit_info_1 = from->exit_info_1;
+ dst->exit_info_2 = from->exit_info_2;
+ dst->exit_int_info = from->exit_int_info;
+ dst->exit_int_info_err = from->exit_int_info_err;
+ dst->nested_ctl = from->nested_ctl;
+ dst->event_inj = from->event_inj;
+ dst->event_inj_err = from->event_inj_err;
+ dst->nested_cr3 = from->nested_cr3;
+ dst->virt_ext = from->virt_ext;
+ dst->pause_filter_count = from->pause_filter_count;
+ dst->pause_filter_thresh = from->pause_filter_thresh;
+}
+
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
u32 user_data_size)
{
struct vcpu_svm *svm;
+ struct vmcb_control_area *ctl;
+ unsigned long r;
struct kvm_nested_state kvm_state = {
.flags = 0,
.format = KVM_STATE_NESTED_FORMAT_SVM,
@@ -1297,9 +1352,18 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu,
*/
if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
return -EFAULT;
- if (copy_to_user(&user_vmcb->control, &svm->nested.ctl,
- sizeof(user_vmcb->control)))
+
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ if (!ctl)
+ return -ENOMEM;
+
+ nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
+ r = copy_to_user(&user_vmcb->control, ctl,
+ sizeof(user_vmcb->control));
+ kfree(ctl);
+ if (r)
return -EFAULT;
+
if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
sizeof(user_vmcb->save)))
return -EFAULT;
@@ -1316,6 +1380,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
&user_kvm_nested_state->data.svm[0];
struct vmcb_control_area *ctl;
struct vmcb_save_area *save;
+ struct vmcb_save_area_cached save_cached;
+ struct vmcb_ctrl_area_cached ctl_cached;
unsigned long cr0;
int ret;
@@ -1345,7 +1411,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
- svm_leave_nested(svm);
+ svm_leave_nested(vcpu);
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
return 0;
}
@@ -1368,7 +1434,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
goto out_free;
ret = -EINVAL;
- if (!nested_vmcb_check_controls(vcpu, ctl))
+ __nested_copy_vmcb_control_to_cache(&ctl_cached, ctl);
+ if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
goto out_free;
/*
@@ -1383,10 +1450,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
* Validate host state saved from before VMRUN (see
* nested_svm_check_permissions).
*/
+ __nested_copy_vmcb_save_to_cache(&save_cached, save);
if (!(save->cr0 & X86_CR0_PG) ||
!(save->cr0 & X86_CR0_PE) ||
(save->rflags & X86_EFLAGS_VM) ||
- !nested_vmcb_valid_sregs(vcpu, save))
+ !__nested_vmcb_check_save(vcpu, &save_cached))
goto out_free;
/*
@@ -1410,7 +1478,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
*/
if (is_guest_mode(vcpu))
- svm_leave_nested(svm);
+ svm_leave_nested(vcpu);
else
svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
@@ -1422,7 +1490,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
- nested_load_control_from_vmcb12(svm, ctl);
+ nested_copy_vmcb_control_to_cache(svm, ctl);
svm_switch_vmcb(svm, &svm->nested.vmcb02);
nested_vmcb02_prepare_control(svm);
@@ -1449,7 +1517,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
* the guest CR3 might be restored prior to setting the nested
* state which can lead to a load of wrong PDPTRs.
*/
- if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
return false;
if (!nested_svm_vmrun_msrpm(svm)) {
@@ -1464,6 +1532,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
}
struct kvm_x86_nested_ops svm_nested_ops = {
+ .leave_nested = svm_leave_nested,
.check_events = svm_check_nested_events,
.triple_fault = nested_svm_triple_fault,
.get_nested_state_pages = svm_get_nested_state_pages,
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 871c426ec389..5aa45f13b16d 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -16,6 +16,7 @@
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"
+#include "svm.h"
enum pmu_type {
PMU_TYPE_COUNTER = 0,
@@ -100,6 +101,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
+ if (!enable_pmu)
+ return NULL;
+
switch (msr) {
case MSR_F15H_PERF_CTL0:
case MSR_F15H_PERF_CTL1:
@@ -134,12 +138,16 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
return &pmu->gp_counters[msr_to_index(msr)];
}
-static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
- u8 event_select,
- u8 unit_mask)
+static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
+ /* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
+ if (WARN_ON(pmc_is_fixed(pmc)))
+ return PERF_COUNT_HW_MAX;
+
for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
if (amd_event_mapping[i].eventsel == event_select
&& amd_event_mapping[i].unit_mask == unit_mask)
@@ -151,12 +159,6 @@ static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
return amd_event_mapping[i].event_type;
}
-/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
-static unsigned amd_find_fixed_event(int idx)
-{
- return PERF_COUNT_HW_MAX;
-}
-
/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
* AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE).
*/
@@ -281,7 +283,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
- pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->reserved_bits = 0xfffffff000280000ull;
pmu->version = 1;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
@@ -319,8 +321,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
}
struct kvm_pmu_ops amd_pmu_ops = {
- .find_arch_event = amd_find_arch_event,
- .find_fixed_event = amd_find_fixed_event,
+ .pmc_perf_hw_id = amd_pmc_perf_hw_id,
.pmc_is_enabled = amd_pmc_is_enabled,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 21ac0a5de4e0..17b53457d866 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -636,7 +636,8 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
struct kvm_vcpu *vcpu;
- int i, ret;
+ unsigned long i;
+ int ret;
if (!sev_es_guest(kvm))
return -ENOTTY;
@@ -1543,35 +1544,57 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id)
return false;
}
-static int sev_lock_for_migration(struct kvm *kvm)
+static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
+ int r = -EBUSY;
+
+ if (dst_kvm == src_kvm)
+ return -EINVAL;
/*
- * Bail if this VM is already involved in a migration to avoid deadlock
- * between two VMs trying to migrate to/from each other.
+ * Bail if these VMs are already involved in a migration to avoid
+ * deadlock between two VMs trying to migrate to/from each other.
*/
- if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1))
+ if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
return -EBUSY;
- mutex_lock(&kvm->lock);
+ if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
+ goto release_dst;
+ r = -EINTR;
+ if (mutex_lock_killable(&dst_kvm->lock))
+ goto release_src;
+ if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING))
+ goto unlock_dst;
return 0;
+
+unlock_dst:
+ mutex_unlock(&dst_kvm->lock);
+release_src:
+ atomic_set_release(&src_sev->migration_in_progress, 0);
+release_dst:
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ return r;
}
-static void sev_unlock_after_migration(struct kvm *kvm)
+static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
+ struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
- mutex_unlock(&kvm->lock);
- atomic_set_release(&sev->migration_in_progress, 0);
+ mutex_unlock(&dst_kvm->lock);
+ mutex_unlock(&src_kvm->lock);
+ atomic_set_release(&dst_sev->migration_in_progress, 0);
+ atomic_set_release(&src_sev->migration_in_progress, 0);
}
static int sev_lock_vcpus_for_migration(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
- int i, j;
+ unsigned long i, j;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (mutex_lock_killable(&vcpu->mutex))
@@ -1593,7 +1616,7 @@ out_unlock:
static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm) {
mutex_unlock(&vcpu->mutex);
@@ -1607,19 +1630,20 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
dst->asid = src->asid;
dst->handle = src->handle;
dst->pages_locked = src->pages_locked;
+ dst->enc_context_owner = src->enc_context_owner;
src->asid = 0;
src->active = false;
src->handle = 0;
src->pages_locked = 0;
+ src->enc_context_owner = NULL;
- INIT_LIST_HEAD(&dst->regions_list);
- list_replace_init(&src->regions_list, &dst->regions_list);
+ list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
}
static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
{
- int i;
+ unsigned long i;
struct kvm_vcpu *dst_vcpu, *src_vcpu;
struct vcpu_svm *dst_svm, *src_svm;
@@ -1666,15 +1690,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
bool charged = false;
int ret;
- ret = sev_lock_for_migration(kvm);
- if (ret)
- return ret;
-
- if (sev_guest(kvm)) {
- ret = -EINVAL;
- goto out_unlock;
- }
-
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
@@ -1682,16 +1697,26 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
}
source_kvm = source_kvm_file->private_data;
- ret = sev_lock_for_migration(source_kvm);
+ ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto out_fput;
- if (!sev_guest(source_kvm)) {
+ if (sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL;
- goto out_source;
+ goto out_unlock;
}
src_sev = &to_kvm_svm(source_kvm)->sev_info;
+
+ /*
+ * VMs mirroring src's encryption context rely on it to keep the
+ * ASID allocated, but below we are clearing src_sev->asid.
+ */
+ if (src_sev->num_mirrored_vms) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1728,13 +1753,11 @@ out_dst_cgroup:
sev_misc_cg_uncharge(cg_cleanup_sev);
put_misc_cg(cg_cleanup_sev->misc_cg);
cg_cleanup_sev->misc_cg = NULL;
-out_source:
- sev_unlock_after_migration(source_kvm);
+out_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
out_fput:
if (source_kvm_file)
fput(source_kvm_file);
-out_unlock:
- sev_unlock_after_migration(kvm);
return ret;
}
@@ -1953,76 +1976,60 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
{
struct file *source_kvm_file;
struct kvm *source_kvm;
- struct kvm_sev_info source_sev, *mirror_sev;
+ struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
- goto e_source_put;
+ goto e_source_fput;
}
source_kvm = source_kvm_file->private_data;
- mutex_lock(&source_kvm->lock);
-
- if (!sev_guest(source_kvm)) {
- ret = -EINVAL;
- goto e_source_unlock;
- }
+ ret = sev_lock_two_vms(kvm, source_kvm);
+ if (ret)
+ goto e_source_fput;
- /* Mirrors of mirrors should work, but let's not get silly */
- if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
+ /*
+ * Mirrors of mirrors should work, but let's not get silly. Also
+ * disallow out-of-band SEV/SEV-ES init if the target is already an
+ * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
+ * created after SEV/SEV-ES initialization, e.g. to init intercepts.
+ */
+ if (sev_guest(kvm) || !sev_guest(source_kvm) ||
+ is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
ret = -EINVAL;
- goto e_source_unlock;
+ goto e_unlock;
}
- memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info,
- sizeof(source_sev));
-
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
+ source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
-
- fput(source_kvm_file);
- mutex_unlock(&source_kvm->lock);
- mutex_lock(&kvm->lock);
-
- /*
- * Disallow out-of-band SEV/SEV-ES init if the target is already an
- * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
- * created after SEV/SEV-ES initialization, e.g. to init intercepts.
- */
- if (sev_guest(kvm) || kvm->created_vcpus) {
- ret = -EINVAL;
- goto e_mirror_unlock;
- }
+ source_sev->num_mirrored_vms++;
/* Set enc_context_owner and copy its encryption context over */
mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
- mirror_sev->asid = source_sev.asid;
- mirror_sev->fd = source_sev.fd;
- mirror_sev->es_active = source_sev.es_active;
- mirror_sev->handle = source_sev.handle;
+ mirror_sev->asid = source_sev->asid;
+ mirror_sev->fd = source_sev->fd;
+ mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->handle = source_sev->handle;
+ INIT_LIST_HEAD(&mirror_sev->regions_list);
+ ret = 0;
+
/*
* Do not copy ap_jump_table. Since the mirror does not share the same
* KVM contexts as the original, and they may have different
* memory-views.
*/
- mutex_unlock(&kvm->lock);
- return 0;
-
-e_mirror_unlock:
- mutex_unlock(&kvm->lock);
- kvm_put_kvm(source_kvm);
- return ret;
-e_source_unlock:
- mutex_unlock(&source_kvm->lock);
-e_source_put:
+e_unlock:
+ sev_unlock_two_vms(kvm, source_kvm);
+e_source_fput:
if (source_kvm_file)
fput(source_kvm_file);
return ret;
@@ -2034,17 +2041,24 @@ void sev_vm_destroy(struct kvm *kvm)
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
+ WARN_ON(sev->num_mirrored_vms);
+
if (!sev_guest(kvm))
return;
/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
if (is_mirroring_enc_context(kvm)) {
- kvm_put_kvm(sev->enc_context_owner);
+ struct kvm *owner_kvm = sev->enc_context_owner;
+ struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
+
+ mutex_lock(&owner_kvm->lock);
+ if (!WARN_ON(!owner_sev->num_mirrored_vms))
+ owner_sev->num_mirrored_vms--;
+ mutex_unlock(&owner_kvm->lock);
+ kvm_put_kvm(owner_kvm);
return;
}
- mutex_lock(&kvm->lock);
-
/*
* Ensure that all guest tagged cache entries are flushed before
* releasing the pages back to the system for use. CLFLUSH will
@@ -2064,8 +2078,6 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
- mutex_unlock(&kvm->lock);
-
sev_unbind_asid(kvm, sev->handle);
sev_asid_free(sev);
}
@@ -2088,8 +2100,13 @@ void __init sev_hardware_setup(void)
if (!sev_enabled || !npt_enabled)
goto out;
- /* Does the CPU support SEV? */
- if (!boot_cpu_has(X86_FEATURE_SEV))
+ /*
+ * SEV must obviously be supported in hardware. Sanity check that the
+ * CPU supports decode assists, which is mandatory for SEV guests to
+ * support instruction emulation.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SEV) ||
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)))
goto out;
/* Retrieve SEV CPUID information */
@@ -2249,7 +2266,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu)
__free_page(virt_to_page(svm->sev_es.vmsa));
if (svm->sev_es.ghcb_sa_free)
- kfree(svm->sev_es.ghcb_sa);
+ kvfree(svm->sev_es.ghcb_sa);
}
static void dump_ghcb(struct vcpu_svm *svm)
@@ -2341,24 +2358,29 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
}
-static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
+static bool sev_es_validate_vmgexit(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu;
struct ghcb *ghcb;
- u64 exit_code = 0;
+ u64 exit_code;
+ u64 reason;
ghcb = svm->sev_es.ghcb;
- /* Only GHCB Usage code 0 is supported */
- if (ghcb->ghcb_usage)
- goto vmgexit_err;
-
/*
- * Retrieve the exit code now even though is may not be marked valid
+ * Retrieve the exit code now even though it may not be marked valid
* as it could help with debugging.
*/
exit_code = ghcb_get_sw_exit_code(ghcb);
+ /* Only GHCB Usage code 0 is supported */
+ if (ghcb->ghcb_usage) {
+ reason = GHCB_ERR_INVALID_USAGE;
+ goto vmgexit_err;
+ }
+
+ reason = GHCB_ERR_MISSING_INPUT;
+
if (!ghcb_sw_exit_code_is_valid(ghcb) ||
!ghcb_sw_exit_info_1_is_valid(ghcb) ||
!ghcb_sw_exit_info_2_is_valid(ghcb))
@@ -2437,30 +2459,34 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
break;
default:
+ reason = GHCB_ERR_INVALID_EVENT;
goto vmgexit_err;
}
- return 0;
+ return true;
vmgexit_err:
vcpu = &svm->vcpu;
- if (ghcb->ghcb_usage) {
+ if (reason == GHCB_ERR_INVALID_USAGE) {
vcpu_unimpl(vcpu, "vmgexit: ghcb usage %#x is not valid\n",
ghcb->ghcb_usage);
+ } else if (reason == GHCB_ERR_INVALID_EVENT) {
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx is not valid\n",
+ exit_code);
} else {
- vcpu_unimpl(vcpu, "vmgexit: exit reason %#llx is not valid\n",
+ vcpu_unimpl(vcpu, "vmgexit: exit code %#llx input is not valid\n",
exit_code);
dump_ghcb(svm);
}
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_code;
- vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ /* Clear the valid entries fields */
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, reason);
- return -EINVAL;
+ return false;
}
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
@@ -2482,7 +2508,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm)
svm->sev_es.ghcb_sa_sync = false;
}
- kfree(svm->sev_es.ghcb_sa);
+ kvfree(svm->sev_es.ghcb_sa);
svm->sev_es.ghcb_sa = NULL;
svm->sev_es.ghcb_sa_free = false;
}
@@ -2530,14 +2556,14 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
scratch_gpa_beg = ghcb_get_sw_scratch(ghcb);
if (!scratch_gpa_beg) {
pr_err("vmgexit: scratch gpa not provided\n");
- return false;
+ goto e_scratch;
}
scratch_gpa_end = scratch_gpa_beg + len;
if (scratch_gpa_end < scratch_gpa_beg) {
pr_err("vmgexit: scratch length (%#llx) not valid for scratch address (%#llx)\n",
len, scratch_gpa_beg);
- return false;
+ goto e_scratch;
}
if ((scratch_gpa_beg & PAGE_MASK) == control->ghcb_gpa) {
@@ -2555,7 +2581,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
scratch_gpa_end > ghcb_scratch_end) {
pr_err("vmgexit: scratch area is outside of GHCB shared buffer area (%#llx - %#llx)\n",
scratch_gpa_beg, scratch_gpa_end);
- return false;
+ goto e_scratch;
}
scratch_va = (void *)svm->sev_es.ghcb;
@@ -2568,18 +2594,18 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
if (len > GHCB_SCRATCH_AREA_LIMIT) {
pr_err("vmgexit: scratch area exceeds KVM limits (%#llx requested, %#llx limit)\n",
len, GHCB_SCRATCH_AREA_LIMIT);
- return false;
+ goto e_scratch;
}
- scratch_va = kzalloc(len, GFP_KERNEL_ACCOUNT);
+ scratch_va = kvzalloc(len, GFP_KERNEL_ACCOUNT);
if (!scratch_va)
- return false;
+ goto e_scratch;
if (kvm_read_guest(svm->vcpu.kvm, scratch_gpa_beg, scratch_va, len)) {
/* Unable to copy scratch area from guest */
pr_err("vmgexit: kvm_read_guest for scratch area failed\n");
- kfree(scratch_va);
- return false;
+ kvfree(scratch_va);
+ goto e_scratch;
}
/*
@@ -2596,6 +2622,12 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
svm->sev_es.ghcb_sa_len = len;
return true;
+
+e_scratch:
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_SCRATCH_AREA);
+
+ return false;
}
static void set_ghcb_msr_bits(struct vcpu_svm *svm, u64 value, u64 mask,
@@ -2646,7 +2678,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_CPUID);
if (!ret) {
- ret = -EINVAL;
+ /* Error, keep GHCB MSR value as-is */
break;
}
@@ -2682,10 +2714,13 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
GHCB_MSR_TERM_REASON_POS);
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- fallthrough;
+
+ ret = -EINVAL;
+ break;
}
default:
- ret = -EINVAL;
+ /* Error, keep GHCB MSR value as-is */
+ break;
}
trace_kvm_vmgexit_msr_protocol_exit(svm->vcpu.vcpu_id,
@@ -2709,14 +2744,18 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
if (!ghcb_gpa) {
vcpu_unimpl(vcpu, "vmgexit: GHCB gpa is not set\n");
- return -EINVAL;
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
}
if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) {
/* Unable to map GHCB from guest */
vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n",
ghcb_gpa);
- return -EINVAL;
+
+ /* Without a GHCB, just return right back to the guest */
+ return 1;
}
svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva;
@@ -2726,15 +2765,14 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
exit_code = ghcb_get_sw_exit_code(ghcb);
- ret = sev_es_validate_vmgexit(svm);
- if (ret)
- return ret;
+ if (!sev_es_validate_vmgexit(svm))
+ return 1;
sev_es_sync_from_ghcb(svm);
ghcb_set_sw_exit_info_1(ghcb, 0);
ghcb_set_sw_exit_info_2(ghcb, 0);
- ret = -EINVAL;
+ ret = 1;
switch (exit_code) {
case SVM_VMGEXIT_MMIO_READ:
if (!setup_vmgexit_scratch(svm, true, control->exit_info_2))
@@ -2775,20 +2813,17 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
default:
pr_err("svm: vmgexit: unsupported AP jump table request - exit_info_1=%#llx\n",
control->exit_info_1);
- ghcb_set_sw_exit_info_1(ghcb, 1);
- ghcb_set_sw_exit_info_2(ghcb,
- X86_TRAP_UD |
- SVM_EVTINJ_TYPE_EXEPT |
- SVM_EVTINJ_VALID);
+ ghcb_set_sw_exit_info_1(ghcb, 2);
+ ghcb_set_sw_exit_info_2(ghcb, GHCB_ERR_INVALID_INPUT);
}
- ret = 1;
break;
}
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
control->exit_info_1, control->exit_info_2);
+ ret = -EINVAL;
break;
default:
ret = svm_invoke_exit_handler(vcpu, exit_code);
@@ -2810,7 +2845,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
return -EINVAL;
if (!setup_vmgexit_scratch(svm, in, bytes))
- return -EINVAL;
+ return 1;
return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa,
count, in);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5630c241d5f6..6d97629655e3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -265,7 +265,7 @@ u32 svm_msrpm_offset(u32 msr)
#define MAX_INST_SIZE 15
-static int get_max_npt_level(void)
+static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
@@ -290,7 +290,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
if (!(efer & EFER_SVME)) {
- svm_leave_nested(svm);
+ svm_leave_nested(vcpu);
svm_set_gif(svm, true);
/* #GP intercept is still needed for vmware backdoor */
if (!enable_vmware_backdoor)
@@ -312,7 +312,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
return ret;
}
- if (svm_gp_erratum_intercept)
+ /*
+ * Never intercept #GP for SEV guests, KVM can't
+ * decrypt guest memory to workaround the erratum.
+ */
+ if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
set_exception_intercept(svm, GP_VECTOR);
}
}
@@ -585,12 +589,10 @@ static int svm_cpu_init(int cpu)
if (!sd)
return ret;
sd->cpu = cpu;
- sd->save_area = alloc_page(GFP_KERNEL);
+ sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!sd->save_area)
goto free_cpu_data;
- clear_page(page_address(sd->save_area));
-
ret = sev_cpu_init(sd);
if (ret)
goto free_save_area;
@@ -871,47 +873,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-/*
- * The default MMIO mask is a single bit (excluding the present bit),
- * which could conflict with the memory encryption bit. Check for
- * memory encryption support and override the default MMIO mask if
- * memory encryption is enabled.
- */
-static __init void svm_adjust_mmio_mask(void)
-{
- unsigned int enc_bit, mask_bit;
- u64 msr, mask;
-
- /* If there is no memory encryption support, use existing mask */
- if (cpuid_eax(0x80000000) < 0x8000001f)
- return;
-
- /* If memory encryption is not enabled, use existing mask */
- rdmsrl(MSR_AMD64_SYSCFG, msr);
- if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
- return;
-
- enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
- mask_bit = boot_cpu_data.x86_phys_bits;
-
- /* Increment the mask bit if it is the same as the encryption bit */
- if (enc_bit == mask_bit)
- mask_bit++;
-
- /*
- * If the mask bit location is below 52, then some bits above the
- * physical addressing limit will always be reserved, so use the
- * rsvd_bits() function to generate the mask. This mask, along with
- * the present bit, will be used to generate a page fault with
- * PFER.RSV = 1.
- *
- * If the mask bit location is 52 (or above), then clear the mask.
- */
- mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
-
- kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
-}
-
static void svm_hardware_teardown(void)
{
int cpu;
@@ -926,191 +887,6 @@ static void svm_hardware_teardown(void)
iopm_base = 0;
}
-static __init void svm_set_cpu_caps(void)
-{
- kvm_set_cpu_caps();
-
- supported_xss = 0;
-
- /* CPUID 0x80000001 and 0x8000000A (SVM features) */
- if (nested) {
- kvm_cpu_cap_set(X86_FEATURE_SVM);
-
- if (nrips)
- kvm_cpu_cap_set(X86_FEATURE_NRIPS);
-
- if (npt_enabled)
- kvm_cpu_cap_set(X86_FEATURE_NPT);
-
- if (tsc_scaling)
- kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
-
- /* Nested VM can receive #VMEXIT instead of triggering #GP */
- kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
- }
-
- /* CPUID 0x80000008 */
- if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
- boot_cpu_has(X86_FEATURE_AMD_SSBD))
- kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
-
- /* CPUID 0x8000001F (SME/SEV features) */
- sev_set_cpu_caps();
-}
-
-static __init int svm_hardware_setup(void)
-{
- int cpu;
- struct page *iopm_pages;
- void *iopm_va;
- int r;
- unsigned int order = get_order(IOPM_SIZE);
-
- /*
- * NX is required for shadow paging and for NPT if the NX huge pages
- * mitigation is enabled.
- */
- if (!boot_cpu_has(X86_FEATURE_NX)) {
- pr_err_ratelimited("NX (Execute Disable) not supported\n");
- return -EOPNOTSUPP;
- }
- kvm_enable_efer_bits(EFER_NX);
-
- iopm_pages = alloc_pages(GFP_KERNEL, order);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
- iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
-
- init_msrpm_offsets();
-
- supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
-
- if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
- kvm_enable_efer_bits(EFER_FFXSR);
-
- if (tsc_scaling) {
- if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- tsc_scaling = false;
- } else {
- pr_info("TSC scaling supported\n");
- kvm_has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 32;
- }
- }
-
- tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
-
- /* Check for pause filtering support */
- if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
- pause_filter_count = 0;
- pause_filter_thresh = 0;
- } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
- pause_filter_thresh = 0;
- }
-
- if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
- }
-
- /*
- * KVM's MMU doesn't support using 2-level paging for itself, and thus
- * NPT isn't supported if the host is using 2-level paging since host
- * CR4 is unchanged on VMRUN.
- */
- if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
- npt_enabled = false;
-
- if (!boot_cpu_has(X86_FEATURE_NPT))
- npt_enabled = false;
-
- /* Force VM NPT level equal to the host's max NPT level */
- kvm_configure_mmu(npt_enabled, get_max_npt_level(),
- get_max_npt_level(), PG_LEVEL_1G);
- pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
-
- /* Note, SEV setup consumes npt_enabled. */
- sev_hardware_setup();
-
- svm_hv_hardware_setup();
-
- svm_adjust_mmio_mask();
-
- for_each_possible_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err;
- }
-
- if (nrips) {
- if (!boot_cpu_has(X86_FEATURE_NRIPS))
- nrips = false;
- }
-
- enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
-
- if (enable_apicv) {
- pr_info("AVIC enabled\n");
-
- amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
- }
-
- if (vls) {
- if (!npt_enabled ||
- !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
- !IS_ENABLED(CONFIG_X86_64)) {
- vls = false;
- } else {
- pr_info("Virtual VMLOAD VMSAVE supported\n");
- }
- }
-
- if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
- svm_gp_erratum_intercept = false;
-
- if (vgif) {
- if (!boot_cpu_has(X86_FEATURE_VGIF))
- vgif = false;
- else
- pr_info("Virtual GIF supported\n");
- }
-
- if (lbrv) {
- if (!boot_cpu_has(X86_FEATURE_LBRV))
- lbrv = false;
- else
- pr_info("LBR virtualization supported\n");
- }
-
- svm_set_cpu_caps();
-
- /*
- * It seems that on AMD processors PTE's accessed bit is
- * being set by the CPU hardware before the NPF vmexit.
- * This is not expected behaviour and our tests fail because
- * of it.
- * A workaround here is to disable support for
- * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
- * In this case userspace can know if there is support using
- * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
- * it
- * If future AMD CPU models change the behaviour described above,
- * this variable can be changed accordingly
- */
- allow_smaller_maxphyaddr = !npt_enabled;
-
- return 0;
-
-err:
- svm_hardware_teardown();
- return r;
-}
-
static void init_seg(struct vmcb_seg *seg)
{
seg->selector = 0;
@@ -1238,9 +1014,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
- * as VMware does.
+ * as VMware does. Don't intercept #GP for SEV guests as KVM can't
+ * decrypt guest memory to decode the faulting instruction.
*/
- if (enable_vmware_backdoor)
+ if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
set_exception_intercept(svm, GP_VECTOR);
svm_set_intercept(svm, INTERCEPT_INTR);
@@ -1435,12 +1212,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (err)
goto error_free_vmsa_page;
- /* We initialize this flag to true to make sure that the is_running
- * bit would be set the first time the vcpu is loaded.
- */
- if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
- svm->avic_is_running = true;
-
svm->msrpm = svm_vcpu_alloc_msrpm();
if (!svm->msrpm) {
err = -ENOMEM;
@@ -1585,12 +1356,27 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = to_svm(vcpu)->vmcb;
+
+ return sev_es_guest(vcpu->kvm)
+ ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
+ : kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
+ kvm_register_mark_available(vcpu, reg);
+
switch (reg) {
case VCPU_EXREG_PDPTR:
- BUG_ON(!npt_enabled);
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ /*
+ * When !npt_enabled, mmu->pdptrs[] is already available since
+ * it is always updated per SDM when moving to CRs.
+ */
+ if (npt_enabled)
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
break;
default:
KVM_BUG_ON(1, vcpu->kvm);
@@ -1777,6 +1563,24 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}
+static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * For guests that don't set guest_state_protected, the cr3 update is
+ * handled via kvm_mmu_load() while entering the guest. For guests
+ * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
+ * VMCB save area now, since the save area will become the initial
+ * contents of the VMSA, and future VMCB save area updates won't be
+ * seen.
+ */
+ if (sev_es_guest(vcpu->kvm)) {
+ svm->vmcb->save.cr3 = cr3;
+ vmcb_mark_dirty(svm->vmcb, VMCB_CR);
+ }
+}
+
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2292,10 +2096,6 @@ static int gp_interception(struct kvm_vcpu *vcpu)
if (error_code)
goto reinject;
- /* All SVM instructions expect page aligned RAX */
- if (svm->vmcb->save.rax & ~PAGE_MASK)
- goto reinject;
-
/* Decode the instruction for usage later */
if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
goto reinject;
@@ -2313,8 +2113,13 @@ static int gp_interception(struct kvm_vcpu *vcpu)
if (!is_guest_mode(vcpu))
return kvm_emulate_instruction(vcpu,
EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
- } else
+ } else {
+ /* All SVM instructions expect page aligned RAX */
+ if (svm->vmcb->save.rax & ~PAGE_MASK)
+ goto reinject;
+
return emulate_svm_instr(vcpu, opcode);
+ }
reinject:
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
@@ -2508,7 +2313,7 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
bool ret = false;
if (!is_guest_mode(vcpu) ||
- (!(vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
+ (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
return false;
cr0 &= ~SVM_CR0_SELECTIVE_MASK;
@@ -3568,14 +3373,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (!gif_set(svm))
return true;
- if (sev_es_guest(vcpu->kvm)) {
- /*
- * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask
- * bit to determine the state of the IF flag.
- */
- if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK))
- return true;
- } else if (is_guest_mode(vcpu)) {
+ if (is_guest_mode(vcpu)) {
/* As long as interrupts are being delivered... */
if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK)
? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)
@@ -3586,7 +3384,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
if (nested_exit_on_intr(svm))
return false;
} else {
- if (!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF))
+ if (!svm_get_if_flag(vcpu))
return true;
}
@@ -3798,6 +3596,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
svm_complete_interrupts(vcpu);
}
+static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
@@ -3929,9 +3732,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
}
+ vcpu->arch.regs_dirty = 0;
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
- kvm_before_interrupt(vcpu);
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
kvm_load_host_xsave_state(vcpu);
stgi();
@@ -3963,8 +3767,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.apf.host_apf_flags =
kvm_read_and_reset_apf_flags();
- if (npt_enabled)
- kvm_register_clear_available(vcpu, VCPU_EXREG_PDPTR);
+ vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;
/*
* We need to handle MC intercepts here before the vcpu has a chance to
@@ -3994,9 +3797,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
hv_track_root_tdp(vcpu, root_hpa);
- /* Loading L2's CR3 is handled by enter_svm_guest_mode. */
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- return;
cr3 = vcpu->arch.cr3;
} else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
@@ -4215,7 +4015,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
info->intercept == x86_intercept_clts)
break;
- if (!(vmcb_is_intercept(&svm->nested.ctl,
+ if (!(vmcb12_is_intercept(&svm->nested.ctl,
INTERCEPT_SELECTIVE_CR0)))
break;
@@ -4434,7 +4234,8 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
*/
vmcb12 = map.hva;
- nested_load_control_from_vmcb12(svm, &vmcb12->control);
+ nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
+ nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
unmap_save:
@@ -4457,79 +4258,140 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
}
}
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
bool smep, smap, is_user;
unsigned long cr4;
+ u64 error_code;
+
+ /* Emulation is always possible when KVM has access to all guest state. */
+ if (!sev_guest(vcpu->kvm))
+ return true;
+
+ /* #UD and #GP should never be intercepted for SEV guests. */
+ WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
+ EMULTYPE_TRAP_UD_FORCED |
+ EMULTYPE_VMWARE_GP));
/*
- * When the guest is an SEV-ES guest, emulation is not possible.
+ * Emulation is impossible for SEV-ES guests as KVM doesn't have access
+ * to guest register state.
*/
if (sev_es_guest(vcpu->kvm))
return false;
/*
+ * Emulation is possible if the instruction is already decoded, e.g.
+ * when completing I/O after returning from userspace.
+ */
+ if (emul_type & EMULTYPE_NO_DECODE)
+ return true;
+
+ /*
+ * Emulation is possible for SEV guests if and only if a prefilled
+ * buffer containing the bytes of the intercepted instruction is
+ * available. SEV guest memory is encrypted with a guest specific key
+ * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
+ * decode garbage.
+ *
+ * Inject #UD if KVM reached this point without an instruction buffer.
+ * In practice, this path should never be hit by a well-behaved guest,
+ * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
+ * is still theoretically reachable, e.g. via unaccelerated fault-like
+ * AVIC access, and needs to be handled by KVM to avoid putting the
+ * guest into an infinite loop. Injecting #UD is somewhat arbitrary,
+ * but its the least awful option given lack of insight into the guest.
+ */
+ if (unlikely(!insn)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+ }
+
+ /*
+ * Emulate for SEV guests if the insn buffer is not empty. The buffer
+ * will be empty if the DecodeAssist microcode cannot fetch bytes for
+ * the faulting instruction because the code fetch itself faulted, e.g.
+ * the guest attempted to fetch from emulated MMIO or a guest page
+ * table used to translate CS:RIP resides in emulated MMIO.
+ */
+ if (likely(insn_len))
+ return true;
+
+ /*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
*
* Errata:
- * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is
- * possible that CPU microcode implementing DecodeAssist will fail
- * to read bytes of instruction which caused #NPF. In this case,
- * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
- * return 0 instead of the correct guest instruction bytes.
- *
- * This happens because CPU microcode reading instruction bytes
- * uses a special opcode which attempts to read data using CPL=0
- * privileges. The microcode reads CS:RIP and if it hits a SMAP
- * fault, it gives up and returns no instruction bytes.
+ * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
+ * possible that CPU microcode implementing DecodeAssist will fail to
+ * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
+ * be '0'. This happens because microcode reads CS:RIP using a _data_
+ * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
+ * gives up and does not fill the instruction bytes buffer.
*
- * Detection:
- * We reach here in case CPU supports DecodeAssist, raised #NPF and
- * returned 0 in GuestIntrBytes field of the VMCB.
- * First, errata can only be triggered in case vCPU CR4.SMAP=1.
- * Second, if vCPU CR4.SMEP=1, errata could only be triggered
- * in case vCPU CPL==3 (Because otherwise guest would have triggered
- * a SMEP fault instead of #NPF).
- * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL.
- * As most guests enable SMAP if they have also enabled SMEP, use above
- * logic in order to attempt minimize false-positive of detecting errata
- * while still preserving all cases semantic correctness.
+ * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
+ * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
+ * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
+ * GuestIntrBytes field of the VMCB.
*
- * Workaround:
- * To determine what instruction the guest was executing, the hypervisor
- * will have to decode the instruction at the instruction pointer.
+ * This does _not_ mean that the erratum has been encountered, as the
+ * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
+ * #PF, e.g. if the guest attempt to execute from emulated MMIO and
+ * encountered a reserved/not-present #PF.
*
- * In non SEV guest, hypervisor will be able to read the guest
- * memory to decode the instruction pointer when insn_len is zero
- * so we return true to indicate that decoding is possible.
+ * To hit the erratum, the following conditions must be true:
+ * 1. CR4.SMAP=1 (obviously).
+ * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
+ * have been hit as the guest would have encountered a SMEP
+ * violation #PF, not a #NPF.
+ * 3. The #NPF is not due to a code fetch, in which case failure to
+ * retrieve the instruction bytes is legitimate (see abvoe).
*
- * But in the SEV guest, the guest memory is encrypted with the
- * guest specific key and hypervisor will not be able to decode the
- * instruction pointer so we will not able to workaround it. Lets
- * print the error and request to kill the guest.
- */
- if (likely(!insn || insn_len))
- return true;
-
- /*
- * If RIP is invalid, go ahead with emulation which will cause an
- * internal error exit.
+ * In addition, don't apply the erratum workaround if the #NPF occurred
+ * while translating guest page tables (see below).
*/
- if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
- return true;
+ error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
+ if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
+ goto resume_guest;
cr4 = kvm_read_cr4(vcpu);
smep = cr4 & X86_CR4_SMEP;
smap = cr4 & X86_CR4_SMAP;
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
- if (!sev_guest(vcpu->kvm))
- return true;
-
pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+ /*
+ * If the fault occurred in userspace, arbitrarily inject #GP
+ * to avoid killing the guest and to hopefully avoid confusing
+ * the guest kernel too much, e.g. injecting #PF would not be
+ * coherent with respect to the guest's page tables. Request
+ * triple fault if the fault occurred in the kernel as there's
+ * no fault that KVM can inject without confusing the guest.
+ * In practice, the triple fault is moot as no sane SEV kernel
+ * will execute from user memory while also running with SMAP=1.
+ */
+ if (is_user)
+ kvm_inject_gp(vcpu, 0);
+ else
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
}
+resume_guest:
+ /*
+ * If the erratum was not hit, simply resume the guest and let it fault
+ * again. While awful, e.g. the vCPU may get stuck in an infinite loop
+ * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
+ * userspace will kill the guest, and letting the emulator read garbage
+ * will yield random behavior and potentially corrupt the guest.
+ *
+ * Simply resuming the guest is technically not a violation of the SEV
+ * architecture. AMD's APM states that all code fetches and page table
+ * accesses for SEV guest are encrypted, regardless of the C-Bit. The
+ * APM also states that encrypted accesses to MMIO are "ignored", but
+ * doesn't explicitly define "ignored", i.e. doing nothing and letting
+ * the guest spin is technically "ignoring" the access.
+ */
return false;
}
@@ -4596,8 +4458,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.prepare_guest_switch = svm_prepare_guest_switch,
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
- .vcpu_blocking = svm_vcpu_blocking,
- .vcpu_unblocking = svm_vcpu_unblocking,
+ .vcpu_blocking = avic_vcpu_blocking,
+ .vcpu_unblocking = avic_vcpu_unblocking,
.update_exception_bitmap = svm_update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
@@ -4609,6 +4471,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.set_cr0 = svm_set_cr0,
+ .post_set_cr3 = svm_post_set_cr3,
.is_valid_cr4 = svm_is_valid_cr4,
.set_cr4 = svm_set_cr4,
.set_efer = svm_set_efer,
@@ -4621,12 +4484,14 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+ .get_if_flag = svm_get_if_flag,
.tlb_flush_all = svm_flush_tlb,
.tlb_flush_current = svm_flush_tlb,
.tlb_flush_gva = svm_flush_tlb_gva,
.tlb_flush_guest = svm_flush_tlb,
+ .vcpu_pre_run = svm_vcpu_pre_run,
.run = svm_vcpu_run,
.handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
@@ -4651,7 +4516,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
- .sync_pir_to_irr = kvm_lapic_find_highest_irr,
.apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,
@@ -4708,6 +4572,243 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
};
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrl(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFER.RSV = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+}
+
+static __init void svm_set_cpu_caps(void)
+{
+ kvm_set_cpu_caps();
+
+ supported_xss = 0;
+
+ /* CPUID 0x80000001 and 0x8000000A (SVM features) */
+ if (nested) {
+ kvm_cpu_cap_set(X86_FEATURE_SVM);
+
+ if (nrips)
+ kvm_cpu_cap_set(X86_FEATURE_NRIPS);
+
+ if (npt_enabled)
+ kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+ if (tsc_scaling)
+ kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+
+ /* Nested VM can receive #VMEXIT instead of triggering #GP */
+ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
+ }
+
+ /* CPUID 0x80000008 */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ /* AMD PMU PERFCTR_CORE CPUID */
+ if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+ kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
+
+ /* CPUID 0x8000001F (SME/SEV features) */
+ sev_set_cpu_caps();
+}
+
+static __init int svm_hardware_setup(void)
+{
+ int cpu;
+ struct page *iopm_pages;
+ void *iopm_va;
+ int r;
+ unsigned int order = get_order(IOPM_SIZE);
+
+ /*
+ * NX is required for shadow paging and for NPT if the NX huge pages
+ * mitigation is enabled.
+ */
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+ kvm_enable_efer_bits(EFER_NX);
+
+ iopm_pages = alloc_pages(GFP_KERNEL, order);
+
+ if (!iopm_pages)
+ return -ENOMEM;
+
+ iopm_va = page_address(iopm_pages);
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
+ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+ init_msrpm_offsets();
+
+ supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+
+ if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+ kvm_enable_efer_bits(EFER_FFXSR);
+
+ if (tsc_scaling) {
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ tsc_scaling = false;
+ } else {
+ pr_info("TSC scaling supported\n");
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
+ }
+ }
+
+ tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
+ if (nested) {
+ printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ }
+
+ /*
+ * KVM's MMU doesn't support using 2-level paging for itself, and thus
+ * NPT isn't supported if the host is using 2-level paging since host
+ * CR4 is unchanged on VMRUN.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
+ npt_enabled = false;
+
+ if (!boot_cpu_has(X86_FEATURE_NPT))
+ npt_enabled = false;
+
+ /* Force VM NPT level equal to the host's paging level */
+ kvm_configure_mmu(npt_enabled, get_npt_level(),
+ get_npt_level(), PG_LEVEL_1G);
+ pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+
+ /* Note, SEV setup consumes npt_enabled. */
+ sev_hardware_setup();
+
+ svm_hv_hardware_setup();
+
+ svm_adjust_mmio_mask();
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
+ if (nrips) {
+ if (!boot_cpu_has(X86_FEATURE_NRIPS))
+ nrips = false;
+ }
+
+ enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
+
+ if (enable_apicv) {
+ pr_info("AVIC enabled\n");
+
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ } else {
+ svm_x86_ops.vcpu_blocking = NULL;
+ svm_x86_ops.vcpu_unblocking = NULL;
+ }
+
+ if (vls) {
+ if (!npt_enabled ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
+ !IS_ENABLED(CONFIG_X86_64)) {
+ vls = false;
+ } else {
+ pr_info("Virtual VMLOAD VMSAVE supported\n");
+ }
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+ svm_gp_erratum_intercept = false;
+
+ if (vgif) {
+ if (!boot_cpu_has(X86_FEATURE_VGIF))
+ vgif = false;
+ else
+ pr_info("Virtual GIF supported\n");
+ }
+
+ if (lbrv) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV))
+ lbrv = false;
+ else
+ pr_info("LBR virtualization supported\n");
+ }
+
+ if (!enable_pmu)
+ pr_info("PMU virtualization is disabled\n");
+
+ svm_set_cpu_caps();
+
+ /*
+ * It seems that on AMD processors PTE's accessed bit is
+ * being set by the CPU hardware before the NPF vmexit.
+ * This is not expected behaviour and our tests fail because
+ * of it.
+ * A workaround here is to disable support for
+ * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+ * In this case userspace can know if there is support using
+ * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
+ * it
+ * If future AMD CPU models change the behaviour described above,
+ * this variable can be changed accordingly
+ */
+ allow_smaller_maxphyaddr = !npt_enabled;
+
+ return 0;
+
+err:
+ svm_hardware_teardown();
+ return r;
+}
+
+
static struct kvm_x86_init_ops svm_init_ops __initdata = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 5faad3dc10e2..73525353e424 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,6 +79,7 @@ struct kvm_sev_info {
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
+ unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
};
@@ -104,6 +105,40 @@ struct kvm_vmcb_info {
uint64_t asid_generation;
};
+struct vmcb_save_area_cached {
+ u64 efer;
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+};
+
+struct vmcb_ctrl_area_cached {
+ u32 intercepts[MAX_INTERCEPT];
+ u16 pause_filter_thresh;
+ u16 pause_filter_count;
+ u64 iopm_base_pa;
+ u64 msrpm_base_pa;
+ u64 tsc_offset;
+ u32 asid;
+ u8 tlb_ctl;
+ u32 int_ctl;
+ u32 int_vector;
+ u32 int_state;
+ u32 exit_code;
+ u32 exit_code_hi;
+ u64 exit_info_1;
+ u64 exit_info_2;
+ u32 exit_int_info;
+ u32 exit_int_info_err;
+ u64 nested_ctl;
+ u32 event_inj;
+ u32 event_inj_err;
+ u64 nested_cr3;
+ u64 virt_ext;
+};
+
struct svm_nested_state {
struct kvm_vmcb_info vmcb02;
u64 hsave_msr;
@@ -119,7 +154,13 @@ struct svm_nested_state {
bool nested_run_pending;
/* cache for control fields of the guest */
- struct vmcb_control_area ctl;
+ struct vmcb_ctrl_area_cached ctl;
+
+ /*
+ * Note: this struct is not kept up-to-date while L2 runs; it is only
+ * valid within nested_svm_vmrun.
+ */
+ struct vmcb_save_area_cached save;
bool initialized;
};
@@ -184,7 +225,6 @@ struct vcpu_svm {
u32 dfr_reg;
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
- bool avic_is_running;
/*
* Per-vcpu list of struct amd_svm_iommu_ir:
@@ -264,11 +304,6 @@ static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
& ~VMCB_ALWAYS_DIRTY_MASK;
}
-static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit)
-{
- return (vmcb->control.clean & (1 << bit));
-}
-
static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
{
vmcb->control.clean &= ~(1 << bit);
@@ -284,6 +319,16 @@ static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_svm, vcpu);
}
+/*
+ * Only the PDPTRs are loaded on demand into the shadow MMU. All other
+ * fields are synchronized in handle_exit, because accessing the VMCB is cheap.
+ *
+ * CR3 might be out of date in the VMCB but it is not marked dirty; instead,
+ * KVM_REQ_LOAD_MMU_PGD is always requested when the cached vcpu->arch.cr3
+ * is changed. svm_load_mmu_pgd() then syncs the new CR3 value into the VMCB.
+ */
+#define SVM_REGS_LAZY_LOAD_SET (1 << VCPU_EXREG_PDPTR)
+
static inline void vmcb_set_intercept(struct vmcb_control_area *control, u32 bit)
{
WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
@@ -302,6 +347,12 @@ static inline bool vmcb_is_intercept(struct vmcb_control_area *control, u32 bit)
return test_bit(bit, (unsigned long *)&control->intercepts);
}
+static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u32 bit)
+{
+ WARN_ON_ONCE(bit >= 32 * MAX_INTERCEPT);
+ return test_bit(bit, (unsigned long *)&control->intercepts);
+}
+
static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb01.ptr;
@@ -454,22 +505,22 @@ static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu)
static inline bool nested_exit_on_smi(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SMI);
}
static inline bool nested_exit_on_intr(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INTR);
}
static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
{
- return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
+ return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
-void svm_leave_nested(struct vcpu_svm *svm);
+void svm_leave_nested(struct kvm_vcpu *vcpu);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
int nested_svm_vmrun(struct kvm_vcpu *vcpu);
@@ -493,8 +544,10 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
int nested_svm_exit_special(struct vcpu_svm *svm);
void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier);
-void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
- struct vmcb_control_area *control);
+void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
+ struct vmcb_control_area *control);
+void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
+ struct vmcb_save_area *save);
void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm);
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
@@ -514,17 +567,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 *entry = svm->avic_physical_id_cache;
-
- if (!entry)
- return false;
-
- return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
-}
-
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@@ -545,8 +587,8 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
-void svm_vcpu_blocking(struct kvm_vcpu *vcpu);
-void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
/* sev.c */
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index c53b8bf8d013..489ca56212c6 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -46,6 +46,9 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
+ hve->hv_enlightenments_control.msr_bitmap = 1;
}
static inline void svm_hv_hardware_setup(void)
@@ -83,14 +86,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments(
struct hv_enlightenments *hve =
(struct hv_enlightenments *)vmcb->control.reserved_sw;
- /*
- * vmcb can be NULL if called during early vcpu init.
- * And its okay not to mark vmcb dirty during vcpu init
- * as we mark it dirty unconditionally towards end of vcpu
- * init phase.
- */
- if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) &&
- hve->hv_enlightenments_control.msr_bitmap)
+ if (hve->hv_enlightenments_control.msr_bitmap)
vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
}
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 4fa17df123cd..dfaeb47fcf2a 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -148,7 +148,7 @@ SYM_FUNC_START(__svm_vcpu_run)
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
3: cmpb $0, kvm_rebooting
jne 2b
@@ -202,7 +202,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
3: cmpb $0, kvm_rebooting
jne 2b
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 953b0fcb21ee..92e6f6702f00 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1356,6 +1356,30 @@ TRACE_EVENT(kvm_apicv_update_request,
__entry->bit)
);
+TRACE_EVENT(kvm_apicv_accept_irq,
+ TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec),
+ TP_ARGS(apicid, dm, tm, vec),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( __u16, dm )
+ __field( __u16, tm )
+ __field( __u8, vec )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apicid;
+ __entry->dm = dm;
+ __entry->tm = tm;
+ __entry->vec = vec;
+ ),
+
+ TP_printk("apicid %x vec %u (%s|%s)",
+ __entry->apicid, __entry->vec,
+ __print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
+ __entry->tm ? "level" : "edge")
+);
+
/*
* Tracepoint for AMD AVIC
*/
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 4705ad55abb5..3f430e218375 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -5,6 +5,7 @@
#include <asm/vmx.h>
#include "lapic.h"
+#include "x86.h"
extern bool __read_mostly enable_vpid;
extern bool __read_mostly flexpriority_enabled;
@@ -53,7 +54,6 @@ struct nested_vmx_msrs {
struct vmcs_config {
int size;
- int order;
u32 basic_cap;
u32 revision_id;
u32 pin_based_exec_ctrl;
@@ -312,6 +312,15 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
+static inline int ept_caps_to_lpage_level(u32 ept_caps)
+{
+ if (ept_caps & VMX_EPT_1GB_PAGE_BIT)
+ return PG_LEVEL_1G;
+ if (ept_caps & VMX_EPT_2MB_PAGE_BIT)
+ return PG_LEVEL_2M;
+ return PG_LEVEL_4K;
+}
+
static inline bool cpu_has_vmx_ept_ad_bits(void)
{
return vmx_capability.ept & VMX_EPT_AD_BIT;
@@ -380,6 +389,9 @@ static inline u64 vmx_get_perf_capabilities(void)
{
u64 perf_cap = 0;
+ if (!enable_pmu)
+ return perf_cap;
+
if (boot_cpu_has(X86_FEATURE_PDCM))
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index ba6f99f584ac..87e3dc10edf4 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -12,8 +12,6 @@
DEFINE_STATIC_KEY_FALSE(enable_evmcs);
-#if IS_ENABLED(CONFIG_HYPERV)
-
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
{EVMCS1_OFFSET(name), clean_field}
@@ -296,6 +294,7 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
+#if IS_ENABLED(CONFIG_HYPERV)
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
@@ -362,6 +361,7 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
case MSR_IA32_VMX_PROCBASED_CTLS2:
ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
break;
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 16731d2cf231..8d70f9aea94b 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -59,12 +59,12 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
SECONDARY_EXEC_SHADOW_VMCS | \
SECONDARY_EXEC_TSC_SCALING | \
SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
-#if IS_ENABLED(CONFIG_HYPERV)
-
struct evmcs_field {
u16 offset;
u16 clean_field;
@@ -73,26 +73,56 @@ struct evmcs_field {
extern const struct evmcs_field vmcs_field_to_evmcs_1[];
extern const unsigned int nr_evmcs_1_fields;
-static __always_inline int get_evmcs_offset(unsigned long field,
- u16 *clean_field)
+static __always_inline int evmcs_field_offset(unsigned long field,
+ u16 *clean_field)
{
unsigned int index = ROL16(field, 6);
const struct evmcs_field *evmcs_field;
- if (unlikely(index >= nr_evmcs_1_fields)) {
- WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
- field);
+ if (unlikely(index >= nr_evmcs_1_fields))
return -ENOENT;
- }
evmcs_field = &vmcs_field_to_evmcs_1[index];
+ /*
+ * Use offset=0 to detect holes in eVMCS. This offset belongs to
+ * 'revision_id' but this field has no encoding and is supposed to
+ * be accessed directly.
+ */
+ if (unlikely(!evmcs_field->offset))
+ return -ENOENT;
+
if (clean_field)
*clean_field = evmcs_field->clean_field;
return evmcs_field->offset;
}
+static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
+ unsigned long field, u16 offset)
+{
+ /*
+ * vmcs12_read_any() doesn't care whether the supplied structure
+ * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
+ * the exact offset of the required field, use it for convenience
+ * here.
+ */
+ return vmcs12_read_any((void *)evmcs, field, offset);
+}
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ int offset = evmcs_field_offset(field, clean_field);
+
+ WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
+ field);
+
+ return offset;
+}
+
static __always_inline void evmcs_write64(unsigned long field, u64 value)
{
u16 clean_field;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1e2f66951566..ba34e94049c7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,6 +7,7 @@
#include <asm/mmu_context.h>
#include "cpuid.h"
+#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
@@ -245,7 +246,8 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
src = &prev->host_state;
dest = &vmx->loaded_vmcs->host_state;
- vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+ vmx_set_vmcs_host_state(dest, src->cr3, src->fs_sel, src->gs_sel,
+ src->fs_base, src->gs_base);
dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
dest->ds_sel = src->ds_sel;
@@ -269,7 +271,13 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
- vmx_register_cache_reset(vcpu);
+ vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
+
+ /*
+ * All lazily updated registers will be reloaded from VMCS12 on both
+ * vmentry and vmexit.
+ */
+ vcpu->arch.regs_dirty = 0;
}
/*
@@ -391,9 +399,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_ept_mmu(vcpu,
- to_vmx(vcpu)->nested.msrs.ept_caps &
- VMX_EPT_EXECUTE_ONLY_BIT,
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
+ int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
+
+ kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
nested_ept_ad_enabled(vcpu),
nested_ept_get_eptp(vcpu));
}
@@ -591,6 +601,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
int msr;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
/* Nothing to do if the MSR bitmap is not in use. */
@@ -598,6 +609,19 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return false;
+ /*
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
+ * and tells KVM (L0) there were no changes in MSR bitmap for L2.
+ */
+ if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
+ evmcs->hv_enlightenments_control.msr_bitmap &&
+ evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
+ return true;
+
if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
@@ -664,6 +688,8 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
+ vmx->nested.force_msr_bitmap_recalc = false;
+
return true;
}
@@ -1095,7 +1121,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
* must not be dereferenced.
*/
if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
- CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
+ CC(!load_pdptrs(vcpu, cr3))) {
*entry_failure_code = ENTRY_FAIL_PDPTE;
return -EINVAL;
}
@@ -1104,7 +1130,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
kvm_mmu_new_pgd(vcpu, cr3);
vcpu->arch.cr3 = cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
@@ -1162,29 +1188,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
WARN_ON(!enable_vpid);
/*
- * If VPID is enabled and used by vmc12, but L2 does not have a unique
- * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
- * a VPID for L2, flush the current context as the effective ASID is
- * common to both L1 and L2.
- *
- * Defer the flush so that it runs after vmcs02.EPTP has been set by
- * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
- * redundant flushes further down the nested pipeline.
- *
- * If a TLB flush isn't required due to any of the above, and vpid12 is
- * changing then the new "virtual" VPID (vpid12) will reuse the same
- * "real" VPID (vpid02), and so needs to be flushed. There's no direct
- * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
- * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate
- * guest-physical mappings, so there is no need to sync the nEPT MMU.
+ * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
+ * emulate a guest TLB flush as KVM does not track vpid12 history nor
+ * is the VPID incorporated into the MMU context. I.e. KVM must assume
+ * that the new vpid12 has never been used and thus represents a new
+ * guest ASID that cannot have entries in the TLB.
*/
- if (!nested_has_guest_tlb_tag(vcpu)) {
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- } else if (is_vmenter &&
- vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- vpid_sync_context(nested_get_vpid02(vcpu));
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
}
+
+ /*
+ * If VPID is enabled, used by vmc12, and vpid12 is not changing but
+ * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
+ * KVM was unable to allocate a VPID for L2, flush the current context
+ * as the effective ASID is common to both L1 and L2.
+ */
+ if (!nested_has_guest_tlb_tag(vcpu))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
@@ -2024,10 +2047,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
* Clean fields data can't be used on VMLAUNCH and when we switch
* between different L2 guests as KVM keeps a single VMCS12 per L1.
*/
- if (from_launch || evmcs_gpa_changed)
+ if (from_launch || evmcs_gpa_changed) {
vmx->nested.hv_evmcs->hv_clean_fields &=
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ vmx->nested.force_msr_bitmap_recalc = true;
+ }
+
return EVMPTRLD_SUCCEEDED;
}
@@ -2594,8 +2620,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
- vmcs12->guest_ia32_perf_global_ctrl)))
+ vmcs12->guest_ia32_perf_global_ctrl))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
+ }
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
kvm_rip_write(vcpu, vmcs12->guest_rip);
@@ -3028,7 +3056,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr4;
bool vm_fail;
if (!nested_early_check)
@@ -3051,12 +3079,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
*/
vmcs_writel(GUEST_RFLAGS, 0);
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
-
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
@@ -3146,7 +3168,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* the guest CR3 might be restored prior to setting the nested
* state which can lead to a load of wrong PDPTRs.
*/
- if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
return false;
}
@@ -3344,8 +3366,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
};
u32 failed_index;
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
@@ -3506,10 +3527,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (evmptrld_status == EVMPTRLD_ERROR) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
- } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
- return nested_vmx_failInvalid(vcpu);
}
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+
+ if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
+ return nested_vmx_failInvalid(vcpu);
+
if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
vmx->nested.current_vmptr == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
@@ -3605,7 +3629,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
!(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
vmx->nested.nested_run_pending = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
break;
case GUEST_ACTIVITY_WAIT_SIPI:
@@ -4502,9 +4526,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
(void)nested_get_evmcs_page(vcpu);
}
- /* Service the TLB flush request for L2 before switching to L1. */
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ /* Service pending TLB flush requests for L2 before switching to L1. */
+ kvm_service_local_tlb_flush_requests(vcpu);
/*
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
@@ -4829,18 +4852,20 @@ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
/*
- * We should allocate a shadow vmcs for vmcs01 only when L1
- * executes VMXON and free it when L1 executes VMXOFF.
- * As it is invalid to execute VMXON twice, we shouldn't reach
- * here when vmcs01 already have an allocated shadow vmcs.
+ * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
+ * when L1 executes VMXOFF or the vCPU is forced out of nested
+ * operation. VMXON faults if the CPU is already post-VMXON, so it
+ * should be impossible to already have an allocated shadow VMCS. KVM
+ * doesn't support virtualization of VMCS shadowing, so vmcs01 should
+ * always be the loaded VMCS.
*/
- WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
+ if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
+ return loaded_vmcs->shadow_vmcs;
+
+ loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+ if (loaded_vmcs->shadow_vmcs)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
- if (!loaded_vmcs->shadow_vmcs) {
- loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
- if (loaded_vmcs->shadow_vmcs)
- vmcs_clear(loaded_vmcs->shadow_vmcs);
- }
return loaded_vmcs->shadow_vmcs;
}
@@ -4857,6 +4882,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
+ vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -5076,27 +5102,49 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- /*
- * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
- * any VMREAD sets the ALU flags for VMfailInvalid.
- */
- if (vmx->nested.current_vmptr == INVALID_GPA ||
- (is_guest_mode(vcpu) &&
- get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
- return nested_vmx_failInvalid(vcpu);
-
/* Decode instruction info and find the field to read */
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
- offset = vmcs_field_to_offset(field);
- if (offset < 0)
- return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
+ * any VMREAD sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
+ return nested_vmx_failInvalid(vcpu);
- if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
- copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+ offset = get_vmcs12_field_offset(field);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- /* Read the field, zero-extended to a u64 value */
- value = vmcs12_read_any(vmcs12, field, offset);
+ /* Read the field, zero-extended to a u64 value */
+ value = vmcs12_read_any(vmcs12, field, offset);
+ } else {
+ /*
+ * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
+ * enlightened VMCS is active VMREAD/VMWRITE instructions are
+ * unsupported. Unfortunately, certain versions of Windows 11
+ * don't comply with this requirement which is not enforced in
+ * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
+ * workaround, as misbehaving guests will panic on VM-Fail.
+ * Note, enlightened VMCS is incompatible with shadow VMCS so
+ * all VMREADs from L2 should go to L1.
+ */
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return nested_vmx_failInvalid(vcpu);
+
+ offset = evmcs_field_offset(field, NULL);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ /* Read the field, zero-extended to a u64 value */
+ value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
+ }
/*
* Now copy part of this value to register or memory, as requested.
@@ -5191,7 +5239,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
- offset = vmcs_field_to_offset(field);
+ offset = get_vmcs12_field_offset(field);
if (offset < 0)
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
@@ -5260,6 +5308,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
vmx->nested.need_vmcs12_to_shadow_sync = true;
}
vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
}
/* Emulate the VMPTRLD instruction */
@@ -5289,8 +5338,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
struct vmcs_hdr hdr;
- if (ghc->gpa != vmptr &&
- kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
/*
* Reads from an unbacked page return all 1s,
* which means that the 32 bits located at the
@@ -6396,6 +6444,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
goto error_guest_mode;
vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
ret = nested_vmx_enter_non_root_mode(vcpu, false);
if (ret)
goto error_guest_mode;
@@ -6438,7 +6487,7 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void)
max_idx = 0;
for (i = 0; i < nr_vmcs12_fields; i++) {
/* The vmcs12 table is very, very sparsely populated. */
- if (!vmcs_field_to_offset_table[i])
+ if (!vmcs12_field_offsets[i])
continue;
idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
@@ -6747,6 +6796,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
}
struct kvm_x86_nested_ops vmx_nested_ops = {
+ .leave_nested = vmx_leave_nested,
.check_events = vmx_check_nested_events,
.hv_timer_pending = nested_vmx_preemption_timer_pending,
.triple_fault = nested_vmx_triple_fault,
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 1b7456b2177b..466d18fc0c5d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -21,7 +21,6 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
- /* Index must match CPUID 0x0A.EBX bit vector */
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
@@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ /* The above index must match CPUID 0x0A.EBX bit vector */
[7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
};
@@ -68,34 +68,29 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
reprogram_counter(pmu, bit);
}
-static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
- u8 event_select,
- u8 unit_mask)
+static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
- for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
- if (intel_arch_events[i].eventsel == event_select
- && intel_arch_events[i].unit_mask == unit_mask
- && (pmu->available_event_types & (1 << i)))
- break;
-
- if (i == ARRAY_SIZE(intel_arch_events))
- return PERF_COUNT_HW_MAX;
+ for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
+ if (intel_arch_events[i].eventsel != event_select ||
+ intel_arch_events[i].unit_mask != unit_mask)
+ continue;
- return intel_arch_events[i].event_type;
-}
+ /* disable event that reported as not present by cpuid */
+ if ((i < 7) && !(pmu->available_event_types & (1 << i)))
+ return PERF_COUNT_HW_MAX + 1;
-static unsigned intel_find_fixed_event(int idx)
-{
- u32 event;
- size_t size = ARRAY_SIZE(fixed_pmc_events);
+ break;
+ }
- if (idx >= size)
+ if (i == ARRAY_SIZE(intel_arch_events))
return PERF_COUNT_HW_MAX;
- event = fixed_pmc_events[array_index_nospec(idx, size)];
- return intel_arch_events[event].event_type;
+ return intel_arch_events[i].event_type;
}
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -459,6 +454,21 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
+static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
+{
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+ struct kvm_pmc *pmc;
+ u32 event;
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ pmc = &pmu->fixed_counters[i];
+ event = fixed_pmc_events[array_index_nospec(i, size)];
+ pmc->eventsel = (intel_arch_events[event].unit_mask << 8) |
+ intel_arch_events[event].eventsel;
+ }
+}
+
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -477,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->reserved_bits = 0xffffffff00200000ull;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
- if (!entry)
+ if (!entry || !enable_pmu)
return;
eax.full = entry->eax;
edx.full = entry->edx;
@@ -500,12 +510,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_fixed_counters = 0;
} else {
pmu->nr_arch_fixed_counters =
- min_t(int, edx.split.num_counters_fixed,
- x86_pmu.num_counters_fixed);
+ min3(ARRAY_SIZE(fixed_pmc_events),
+ (size_t) edx.split.num_counters_fixed,
+ (size_t) x86_pmu.num_counters_fixed);
edx.split.bit_width_fixed = min_t(int,
edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
+ setup_fixed_pmc_eventsel(pmu);
}
pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
@@ -703,8 +715,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
}
struct kvm_pmu_ops intel_pmu_ops = {
- .find_arch_event = intel_find_arch_event,
- .find_fixed_event = intel_find_fixed_event,
+ .pmc_perf_hw_id = intel_pmc_perf_hw_id,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5f81ef092bd4..aa1fe9085d77 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -5,63 +5,115 @@
#include <asm/cpu.h>
#include "lapic.h"
+#include "irq.h"
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
/*
- * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
+ * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler()
+ * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when
+ * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled.
+ * The vCPUs posted interrupt descriptor is updated at the same time to set its
+ * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices
+ * wake the target vCPUs. vCPUs are removed from the list and the notification
+ * vector is reset when the vCPU is scheduled in.
*/
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
+/*
+ * Protect the per-CPU list with a per-CPU spinlock to handle task migration.
+ * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
+ * ->sched_in() path will need to take the vCPU off the list of the _previous_
+ * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
+ * occur if a wakeup IRQ arrives and attempts to acquire the lock.
+ */
+static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
return &(to_vmx(vcpu)->pi_desc);
}
+static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
+{
+ /*
+ * PID.ON can be set at any time by a different vCPU or by hardware,
+ * e.g. a device. PID.control must be written atomically, and the
+ * update must be retried with a fresh snapshot an ON change causes
+ * the cmpxchg to fail.
+ */
+ if (cmpxchg64(&pi_desc->control, old, new) != old)
+ return -EBUSY;
+
+ return 0;
+}
+
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new;
+ unsigned long flags;
unsigned int dest;
/*
- * In case of hot-plug or hot-unplug, we may have to undo
- * vmx_vcpu_pi_put even if there is no assigned device. And we
- * always keep PI.NDST up to date for simplicity: it makes the
- * code easier, and CPU migration is not a fast path.
+ * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and
+ * PI.SN up-to-date even if there is no assigned device or if APICv is
+ * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC.
*/
- if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+ if (!enable_apicv || !lapic_in_kernel(vcpu))
return;
/*
- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
- * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
- * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
- * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
- * correctly.
+ * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
+ * full update can be skipped as neither the vector nor the destination
+ * needs to be changed.
*/
- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
- pi_clear_sn(pi_desc);
- goto after_clear_sn;
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
+ /*
+ * Clear SN if it was set due to being preempted. Again, do
+ * this even if there is no assigned device for simplicity.
+ */
+ if (pi_test_and_clear_sn(pi_desc))
+ goto after_clear_sn;
+ return;
}
- /* The full case. */
- do {
- old.control = new.control = pi_desc->control;
+ local_irq_save(flags);
- dest = cpu_physical_id(cpu);
+ /*
+ * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
+ * list of the _previous_ pCPU, which will not be the same as the
+ * current pCPU if the task was migrated.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ list_del(&vmx->pi_wakeup_list);
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ }
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
+ dest = cpu_physical_id(cpu);
+ if (!x2apic_mode)
+ dest = (dest << 8) & 0xFF00;
+
+ do {
+ old.control = new.control = READ_ONCE(pi_desc->control);
+ /*
+ * Clear SN (as above) and refresh the destination APIC ID to
+ * handle task migration (@cpu != vcpu->cpu).
+ */
+ new.ndst = dest;
new.sn = 0;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
+
+ /*
+ * Restore the notification vector; in the blocking case, the
+ * descriptor was modified on "put" to use the wakeup vector.
+ */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (pi_try_set_control(pi_desc, old.control, new.control));
+
+ local_irq_restore(flags);
after_clear_sn:
@@ -77,130 +129,71 @@ after_clear_sn:
pi_set_on(pi_desc);
}
-void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return;
-
- /* Set SN when the vCPU is preempted */
- if (vcpu->preempted)
- pi_set_sn(pi_desc);
-}
-
-static void __pi_post_block(struct kvm_vcpu *vcpu)
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- do {
- old.control = new.control = pi_desc->control;
- WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
- "Wakeup handler not enabled while the VCPU is blocked\n");
-
- dest = cpu_physical_id(vcpu->cpu);
-
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- vcpu->pre_pcpu = -1;
- }
+ return irqchip_in_kernel(kvm) && enable_apicv &&
+ kvm_arch_has_assigned_device(kvm) &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
}
/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- * we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- * 'NDST' <-- vcpu->pre_pcpu
- * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- * interrupt is posted for this vCPU, we cannot block it, in
- * this case, return 1, otherwise, return 0.
- *
+ * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
+ * WAKEUP as the notification vector in the PI descriptor.
*/
-int pi_pre_block(struct kvm_vcpu *vcpu)
+static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{
- unsigned int dest;
- struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct pi_desc old, new;
+ unsigned long flags;
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return 0;
-
- WARN_ON(irqs_disabled());
- local_irq_disable();
- if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu,
- vcpu->pre_pcpu));
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- }
-
- do {
- old.control = new.control = pi_desc->control;
+ local_irq_save(flags);
- WARN((pi_desc->sn == 1),
- "Warning: SN field of posted-interrupts "
- "is set before blocking\n");
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ list_add_tail(&vmx->pi_wakeup_list,
+ &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
- /*
- * Since vCPU can be preempted during this process,
- * vcpu->cpu could be different with pre_pcpu, we
- * need to set pre_pcpu as the destination of wakeup
- * notification event, then we can find the right vCPU
- * to wakeup in wakeup handler if interrupts happen
- * when the vCPU is in blocked state.
- */
- dest = cpu_physical_id(vcpu->pre_pcpu);
+ WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
+ do {
+ old.control = new.control = READ_ONCE(pi_desc->control);
/* set 'NV' to 'wakeup vector' */
new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
+ } while (pi_try_set_control(pi_desc, old.control, new.control));
- /* We should not block the vCPU if an interrupt is posted for it. */
- if (pi_test_on(pi_desc) == 1)
- __pi_post_block(vcpu);
+ /*
+ * Send a wakeup IPI to this CPU if an interrupt may have been posted
+ * before the notification vector was updated, in which case the IRQ
+ * will arrive on the non-wakeup vector. An IPI is needed as calling
+ * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
+ * enabled until it is safe to call try_to_wake_up() on the task being
+ * scheduled out).
+ */
+ if (pi_test_on(&new))
+ apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
- local_irq_enable();
- return (vcpu->pre_pcpu == -1);
+ local_irq_restore(flags);
}
-void pi_post_block(struct kvm_vcpu *vcpu)
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
- if (vcpu->pre_pcpu == -1)
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
- WARN_ON(irqs_disabled());
- local_irq_disable();
- __pi_post_block(vcpu);
- local_irq_enable();
+ if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
+ pi_enable_wakeup_handler(vcpu);
+
+ /*
+ * Set SN when the vCPU is preempted. Note, the vCPU can both be seen
+ * as blocking and preempted, e.g. if it's preempted between setting
+ * its wait state and manually scheduling out.
+ */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
}
/*
@@ -208,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu)
*/
void pi_wakeup_handler(void)
{
- struct kvm_vcpu *vcpu;
int cpu = smp_processor_id();
+ struct vcpu_vmx *vmx;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
- list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
- blocked_vcpu_list) {
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
+ list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu),
+ pi_wakeup_list) {
- if (pi_test_on(pi_desc) == 1)
- kvm_vcpu_kick(vcpu);
+ if (pi_test_on(&vmx->pi_desc))
+ kvm_vcpu_wake_up(&vmx->vcpu);
}
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
void __init pi_init_cpu(int cpu)
{
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
+ raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
@@ -241,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
* Bail out of the block loop if the VM has an assigned
* device, but the blocking vCPU didn't reconfigure the
* PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in pi_pre_block().
+ * came along after the initial check in vmx_vcpu_pi_put().
*/
void vmx_pi_start_assignment(struct kvm *kvm)
{
@@ -270,9 +262,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
struct vcpu_data vcpu_info;
int idx, ret = 0;
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ if (!vmx_can_use_vtd_pi(kvm))
return 0;
idx = srcu_read_lock(&kvm->irq_srcu);
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 7f7b2326caf5..eb14e76b84ef 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -40,7 +40,13 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
@@ -74,13 +80,13 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_on(struct pi_desc *pi_desc)
+static inline bool pi_test_on(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_ON,
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_sn(struct pi_desc *pi_desc)
+static inline bool pi_test_sn(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
@@ -88,8 +94,6 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
-int pi_pre_block(struct kvm_vcpu *vcpu);
-void pi_post_block(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 6e5de2e2b0da..e325c290a816 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -129,6 +129,11 @@ static inline bool is_machine_check(u32 intr_info)
return is_exception_n(intr_info, MC_VECTOR);
}
+static inline bool is_nm_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, NM_VECTOR);
+}
+
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index cab6ba7a5005..2251b60920f8 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -8,7 +8,7 @@
FIELD(number, name), \
[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
-const unsigned short vmcs_field_to_offset_table[] = {
+const unsigned short vmcs12_field_offsets[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@@ -151,4 +151,4 @@ const unsigned short vmcs_field_to_offset_table[] = {
FIELD(HOST_RSP, host_rsp),
FIELD(HOST_RIP, host_rip),
};
-const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table);
+const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 2a45f026ee11..746129ddd5ae 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -361,10 +361,10 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(guest_pml_index, 996);
}
-extern const unsigned short vmcs_field_to_offset_table[];
+extern const unsigned short vmcs12_field_offsets[];
extern const unsigned int nr_vmcs12_fields;
-static inline short vmcs_field_to_offset(unsigned long field)
+static inline short get_vmcs12_field_offset(unsigned long field)
{
unsigned short offset;
unsigned int index;
@@ -377,7 +377,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
return -ENOENT;
index = array_index_nospec(index, nr_vmcs12_fields);
- offset = vmcs_field_to_offset_table[index];
+ offset = vmcs12_field_offsets[index];
if (offset == 0)
return -ENOENT;
return offset;
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 3a6461694fc2..435c187927c4 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -49,14 +49,14 @@ SYM_FUNC_START_LOCAL(vmx_vmenter)
je 2f
1: vmresume
- ret
+ RET
2: vmlaunch
- ret
+ RET
3: cmpb $0, kvm_rebooting
je 4f
- ret
+ RET
4: ud2
_ASM_EXTABLE(1b, 3b)
@@ -89,7 +89,7 @@ SYM_FUNC_START(vmx_vmexit)
pop %_ASM_AX
.Lvmexit_skip_rsb:
#endif
- ret
+ RET
SYM_FUNC_END(vmx_vmexit)
/**
@@ -228,7 +228,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
/* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
2: mov $1, %eax
@@ -293,7 +293,7 @@ SYM_FUNC_START(vmread_error_trampoline)
pop %_ASM_AX
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(vmread_error_trampoline)
SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
@@ -326,5 +326,5 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
*/
mov %_ASM_BP, %_ASM_SP
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ba66c171d951..aca3ae2a02f3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -36,6 +36,7 @@
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
@@ -161,6 +162,8 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_FS_BASE,
MSR_GS_BASE,
MSR_KERNEL_GS_BASE,
+ MSR_IA32_XFD,
+ MSR_IA32_XFD_ERR,
#endif
MSR_IA32_SYSENTER_CS,
MSR_IA32_SYSENTER_ESP,
@@ -602,15 +605,13 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0;
- u64 old_msr_data = msr->data;
- msr->data = data;
if (msr->load_into_hardware) {
preempt_disable();
- ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
+ ret = kvm_set_user_return_msr(slot, data, msr->mask);
preempt_enable();
- if (ret)
- msr->data = old_msr_data;
}
+ if (!ret)
+ msr->data = data;
return ret;
}
@@ -763,6 +764,14 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
}
+ /*
+ * Disabling xfd interception indicates that dynamic xfeatures
+ * might be used in the guest. Always trap #NM in this case
+ * to save guest xfd_err timely.
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ eb |= (1u << NM_VECTOR);
+
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -1071,9 +1080,14 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
-void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
- unsigned long fs_base, unsigned long gs_base)
+void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
+ u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base)
{
+ if (unlikely(cr3 != host->cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ host->cr3 = cr3;
+ }
if (unlikely(fs_sel != host->fs_sel)) {
if (!(fs_sel & 7))
vmcs_write16(HOST_FS_SELECTOR, fs_sel);
@@ -1168,7 +1182,9 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
gs_base = segment_base(gs_sel);
#endif
- vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
+ vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(),
+ fs_sel, gs_sel, fs_base, gs_base);
+
vmx->guest_state_loaded = true;
}
@@ -1271,7 +1287,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
if (!already_loaded) {
void *gdt = get_current_gdt_ro();
- unsigned long sysenter_esp;
/*
* Flush all EPTP/VPID contexts, the new pCPU may have stale
@@ -1287,8 +1302,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
+ /* 22.2.3 */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(cpu) + 1));
+ }
vmx->loaded_vmcs->cpu = cpu;
}
@@ -1363,6 +1381,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmx->emulation_required = vmx_emulation_required(vcpu);
}
+static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -1464,11 +1487,12 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
- * not point tthe failing instruction, and even if it did, the code
+ * not point at the failing instruction, and even if it did, the code
* stream is inaccessible. Inject #UD instead of exiting to userspace
* so that guest userspace can't DoS the guest simply by triggering
* emulation (enclaves are CPL3 only).
@@ -1748,7 +1772,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
}
/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
@@ -1955,6 +1979,24 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KERNEL_GS_BASE:
vmx_write_guest_kernel_gs_base(vmx, data);
break;
+ case MSR_IA32_XFD:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ /*
+ * Always intercepting WRMSR could incur non-negligible
+ * overhead given xfd might be changed frequently in
+ * guest context switch. Disable write interception
+ * upon the first write with a non-zero value (indicating
+ * potential usage on dynamic xfeatures). Also update
+ * exception bitmap to trap #NM for proper virtualization
+ * of guest xfd_err.
+ */
+ if (!ret && data) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
+ MSR_TYPE_RW);
+ vcpu->arch.xfd_no_write_intercept = true;
+ vmx_update_exception_bitmap(vcpu);
+ }
+ break;
#endif
case MSR_IA32_SYSENTER_CS:
if (is_guest_mode(vcpu))
@@ -2095,9 +2137,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
ret = kvm_set_msr_common(vcpu, msr_info);
break;
- case MSR_IA32_TSC_ADJUST:
- ret = kvm_set_msr_common(vcpu, msr_info);
- break;
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
@@ -2565,7 +2604,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
vmcs_conf->revision_id = vmx_msr_low;
@@ -2590,7 +2628,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
struct page *pages;
struct vmcs *vmcs;
- pages = __alloc_pages_node(node, flags, vmcs_config.order);
+ pages = __alloc_pages_node(node, flags, 0);
if (!pages)
return NULL;
vmcs = page_address(pages);
@@ -2609,7 +2647,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
void free_vmcs(struct vmcs *vmcs)
{
- free_pages((unsigned long)vmcs, vmcs_config.order);
+ free_page((unsigned long)vmcs);
}
/*
@@ -2646,15 +2684,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
- if (IS_ENABLED(CONFIG_HYPERV) &&
- static_branch_unlikely(&enable_evmcs) &&
- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
- struct hv_enlightened_vmcs *evmcs =
- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
- evmcs->hv_enlightenments_control.msr_bitmap = 1;
- }
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -2918,6 +2947,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
}
}
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -2930,31 +2966,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
if (enable_ept)
ept_sync_context(construct_eptp(vcpu, root_hpa,
mmu->shadow_root_level));
- else if (!is_guest_mode(vcpu))
- vpid_sync_context(to_vmx(vcpu)->vpid);
else
- vpid_sync_context(nested_get_vpid02(vcpu));
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
- * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
* vmx_flush_tlb_guest() for an explanation of why this is ok.
*/
- vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
- * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
- * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
- * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
* disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
* i.e. no explicit INVVPID is necessary.
*/
- vpid_sync_context(to_vmx(vcpu)->vpid);
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -2984,7 +3018,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
}
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
@@ -3066,6 +3100,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
/* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+
+ /*
+ * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
+ * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
+ */
+ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
}
/* depends on vcpu->arch.cr0 to be set to a new value */
@@ -3109,9 +3150,9 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
- else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
guest_cr3 = vcpu->arch.cr3;
- else /* vmcs01.GUEST_CR3 is already up-to-date. */
+ else /* vmcs.GUEST_CR3 is already up-to-date. */
update_guest_cr3 = false;
vmx_ept_load_pdptrs(vcpu);
} else {
@@ -3122,6 +3163,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcs_writel(GUEST_CR3, guest_cr3);
}
+
static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
@@ -3686,6 +3728,19 @@ void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
+static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
+{
+ /*
+ * When KVM is a nested hypervisor on top of Hyper-V and uses
+ * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
+ * bitmap has changed.
+ */
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_touch_msr_bitmap();
+
+ vmx->nested.force_msr_bitmap_recalc = true;
+}
+
void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3694,8 +3749,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
if (!cpu_has_vmx_msr_bitmap())
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ vmx_msr_bitmap_l01_changed(vmx);
/*
* Mark the desired intercept state in shadow bitmap, this is needed
@@ -3739,8 +3793,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
if (!cpu_has_vmx_msr_bitmap())
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ vmx_msr_bitmap_l01_changed(vmx);
/*
* Mark the desired intercept state in shadow bitmap, this is needed
@@ -3878,12 +3931,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
pt_update_intercept_for_msr(vcpu);
}
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
- bool nested)
+static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ int pi_vec)
{
#ifdef CONFIG_SMP
- int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
-
if (vcpu->mode == IN_GUEST_MODE) {
/*
* The vector of interrupt to be delivered to vcpu had
@@ -3911,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
*/
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
- return true;
+ return;
}
#endif
- return false;
+ /*
+ * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+ * otherwise do nothing as KVM will grab the highest priority pending
+ * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+ */
+ kvm_vcpu_wake_up(vcpu);
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -3930,9 +3986,21 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
*/
vmx->nested.pi_pending = true;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * This pairs with the smp_mb_*() after setting vcpu->mode in
+ * vcpu_enter_guest() to guarantee the vCPU sees the event
+ * request if triggering a posted interrupt "fails" because
+ * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
+ * the smb_wmb() in kvm_make_request() only ensures everything
+ * done before making the request is visible when the request
+ * is visible, it doesn't ensure ordering between the store to
+ * vcpu->requests and the load from vcpu->mode.
+ */
+ smp_mb__after_atomic();
+
/* the PIR and ON have been set by L1. */
- if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
- kvm_vcpu_kick(vcpu);
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
return 0;
}
return -1;
@@ -3963,10 +4031,13 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (pi_test_and_set_on(&vmx->pi_desc))
return 0;
- if (vcpu != kvm_get_running_vcpu() &&
- !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
- kvm_vcpu_kick(vcpu);
-
+ /*
+ * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
+ * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
+ * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
+ * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
+ */
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
return 0;
}
@@ -4021,6 +4092,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+
+ /*
+ * SYSENTER is used for 32-bit system calls on either 32-bit or
+ * 64-bit kernels. It is always zero If neither is allowed, otherwise
+ * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
+ * have already done so!).
+ */
+ if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+
rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
@@ -4039,8 +4120,10 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
~vcpu->arch.cr4_guest_rsvd_bits;
- if (!enable_ept)
- vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
+ if (!enable_ept) {
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
+ }
if (is_guest_mode(&vmx->vcpu))
vcpu->arch.cr4_guest_owned_bits &=
~get_vmcs12(vcpu)->cr4_guest_host_mask;
@@ -4692,7 +4775,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (kvm_emulate_instruction(vcpu, 0)) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
return 1;
}
@@ -4748,6 +4831,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
if (is_machine_check(intr_info) || is_nmi(intr_info))
return 1; /* handled by handle_exception_nmi_irqoff() */
+ /*
+ * Queue the exception here instead of in handle_nm_fault_irqoff().
+ * This ensures the nested_vmx check is not skipped so vmexit can
+ * be reflected to L1 (when it intercepts #NM) before reaching this
+ * point.
+ */
+ if (is_nm_fault(intr_info)) {
+ kvm_queue_exception(vcpu, NM_VECTOR);
+ return 1;
+ }
+
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
@@ -4811,8 +4905,33 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
dr6 = vmx_get_exit_qual(vcpu);
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ /*
+ * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+ * instruction. ICEBP generates a trap-like #DB, but
+ * despite its interception control being tied to #DB,
+ * is an instruction intercept, i.e. the VM-Exit occurs
+ * on the ICEBP itself. Note, skipping ICEBP also
+ * clears STI and MOVSS blocking.
+ *
+ * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+ * if single-step is enabled in RFLAGS and STI or MOVSS
+ * blocking is active, as the CPU doesn't set the bit
+ * on VM-Exit due to #DB interception. VM-Entry has a
+ * consistency check that a single-step #DB is pending
+ * in this scenario as the previous instruction cannot
+ * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+ * don't modify RFLAGS), therefore the one instruction
+ * delay when activating single-step breakpoints must
+ * have already expired. Note, the CPU sets/clears BS
+ * as appropriate for all other VM-Exits types.
+ */
if (is_icebp(intr_info))
WARN_ON(!skip_emulated_instruction(vcpu));
+ else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
@@ -5307,7 +5426,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+ if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
return 1;
/*
@@ -5336,6 +5455,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1;
}
+static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->emulation_required && !vmx->rmode.vm86_active &&
+ vcpu->arch.exception.pending;
+}
+
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5355,15 +5482,14 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (!kvm_emulate_instruction(vcpu, 0))
return 0;
- if (vmx->emulation_required && !vmx->rmode.vm86_active &&
- vcpu->arch.exception.pending) {
+ if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
/*
@@ -5378,6 +5504,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
+static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ return 1;
+}
+
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5881,18 +6017,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vmx_flush_pml_buffer(vcpu);
/*
- * We should never reach this point with a pending nested VM-Enter, and
- * more specifically emulation of L2 due to invalid guest state (see
- * below) should never happen as that means we incorrectly allowed a
- * nested VM-Enter with an invalid vmcs12.
+ * KVM should never reach this point with a pending nested VM-Enter.
+ * More specifically, short-circuiting VM-Entry to emulate L2 due to
+ * invalid guest state should never happen as that means KVM knowingly
+ * allowed a nested VM-Enter with an invalid vmcs12. More below.
*/
if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
return -EIO;
- /* If guest state is invalid, start emulating */
- if (vmx->emulation_required)
- return handle_invalid_guest_state(vcpu);
-
if (is_guest_mode(vcpu)) {
/*
* PML is never enabled when running L2, bail immediately if a
@@ -5914,10 +6046,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
*/
nested_mark_vmcs12_pages_dirty(vcpu);
+ /*
+ * Synthesize a triple fault if L2 state is invalid. In normal
+ * operation, nested VM-Enter rejects any attempt to enter L2
+ * with invalid state. However, those checks are skipped if
+ * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+ * L2 state is invalid, it means either L1 modified SMRAM state
+ * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+ * doing so is architecturally allowed in the RSM case, and is
+ * the least awful solution for the userspace case without
+ * risking false positives.
+ */
+ if (vmx->emulation_required) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+ return 1;
+ }
+
if (nested_vmx_reflect_vmexit(vcpu))
return 1;
}
+ /* If guest state is invalid, start emulating. L2 is handled above. */
+ if (vmx->emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
if (exit_reason.failed_vmentry) {
dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -6262,9 +6414,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
- bool max_irr_updated;
+ bool got_posted_interrupt;
- if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;
if (pi_test_on(&vmx->pi_desc)) {
@@ -6274,22 +6426,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
- max_irr_updated =
+ got_posted_interrupt =
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
- /*
- * If we are running L2 and L1 has a new pending interrupt
- * which can be injected, this may cause a vmexit or it may
- * be injected into L2. Either way, this interrupt will be
- * processed via KVM_REQ_EVENT, not RVI, because we do not use
- * virtual interrupt delivery to inject L1 interrupts into L2.
- */
- if (is_guest_mode(vcpu) && max_irr_updated)
- kvm_make_request(KVM_REQ_EVENT, vcpu);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
}
- vmx_hwapic_irr_update(vcpu, max_irr);
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return max_irr;
}
@@ -6317,11 +6480,33 @@ void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
unsigned long entry)
{
- kvm_before_interrupt(vcpu);
+ bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
+
+ kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
vmx_do_interrupt_nmi_irqoff(entry);
kvm_after_interrupt(vcpu);
}
+static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Save xfd_err to guest_fpu before interrupt is enabled, so the
+ * MSR value is not clobbered by the host activity before the guest
+ * has chance to consume it.
+ *
+ * Do not blindly read xfd_err here, since this exception might
+ * be caused by L1 interception on a platform which doesn't
+ * support xfd at all.
+ *
+ * Do it conditionally upon guest_fpu::xfd. xfd_err matters
+ * only when xfd contains a non-zero value.
+ *
+ * Queuing exception is done in vmx_handle_exit. See comment there.
+ */
+ if (vcpu->arch.guest_fpu.fpstate->xfd)
+ rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+}
+
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
@@ -6330,6 +6515,9 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ /* if exit due to NM, handle before interrupts are enabled */
+ else if (is_nm_fault(intr_info))
+ handle_nm_fault_irqoff(&vmx->vcpu);
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
@@ -6588,7 +6776,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -6601,9 +6789,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
* consistency check VM-Exit due to invalid guest state and bail.
*/
if (unlikely(vmx->emulation_required)) {
-
- /* We don't emulate invalid state of a nested guest */
- vmx->fail = is_guest_mode(vcpu);
+ vmx->fail = 0;
vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
vmx->exit_reason.failed_vmentry = 1;
@@ -6631,12 +6817,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
+ vcpu->arch.regs_dirty = 0;
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
@@ -6725,7 +6906,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
loadsegment(es, __USER_DS);
#endif
- vmx_register_cache_reset(vcpu);
+ vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
pt_guest_exit(vmx);
@@ -6793,6 +6974,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
+ INIT_LIST_HEAD(&vmx->pi_wakeup_list);
+
err = -ENOMEM;
vmx->vpid = allocate_vpid();
@@ -6826,6 +7009,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
if (err < 0)
goto free_pml;
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+
/* The MSR bitmap starts with all ones */
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -6931,7 +7127,6 @@ static int __init vmx_check_processor_compat(void)
static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
- u64 ipat = 0;
/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
* memory aliases with conflicting memory types and sometimes MCEs.
@@ -6951,30 +7146,22 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
* EPT memory type is used to emulate guest CD/MTRR.
*/
- if (is_mmio) {
- cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
+ if (is_mmio)
+ return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
- ipat = VMX_EPT_IPAT_BIT;
- cache = MTRR_TYPE_WRBACK;
- goto exit;
- }
+ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
- ipat = VMX_EPT_IPAT_BIT;
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cache = MTRR_TYPE_WRBACK;
else
cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
- cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+ return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+ }
-exit:
- return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
+ return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
@@ -7166,6 +7353,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
}
}
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+
+
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
@@ -7405,25 +7597,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
-{
- if (pi_pre_block(vcpu))
- return 1;
-
- if (kvm_lapic_hv_timer_in_use(vcpu))
- kvm_lapic_switch_to_sw_timer(vcpu);
-
- return 0;
-}
-
-static void vmx_post_block(struct kvm_vcpu *vcpu)
-{
- if (kvm_x86_ops.set_hv_timer)
- kvm_lapic_switch_to_hv_timer(vcpu);
-
- pi_post_block(vcpu);
-}
-
static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@ -7509,6 +7682,7 @@ static void hardware_unsetup(void)
static bool vmx_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
@@ -7558,12 +7732,14 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
.tlb_flush_all = vmx_flush_tlb_all,
.tlb_flush_current = vmx_flush_tlb_current,
.tlb_flush_gva = vmx_flush_tlb_gva,
.tlb_flush_guest = vmx_flush_tlb_guest,
+ .vcpu_pre_run = vmx_vcpu_pre_run,
.run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction,
@@ -7622,9 +7798,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
- .pre_block = vmx_pre_block,
- .post_block = vmx_post_block,
-
.pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
@@ -7653,6 +7826,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
+static unsigned int vmx_handle_intel_pt_intr(void)
+{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+ /* '0' on failure so that the !PT case can use a RET0 static call. */
+ if (!kvm_arch_pmi_in_guest(vcpu))
+ return 0;
+
+ kvm_make_request(KVM_REQ_PMI, vcpu);
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ (unsigned long *)&vcpu->arch.pmu.global_status);
+ return 1;
+}
+
static __init void vmx_setup_user_return_msrs(void)
{
@@ -7679,11 +7866,13 @@ static __init void vmx_setup_user_return_msrs(void)
kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
}
+static struct kvm_x86_init_ops vmx_init_ops __initdata;
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
- int r, ept_lpage_level;
+ int r;
store_idt(&dt);
host_idt_base = dt.address;
@@ -7761,10 +7950,10 @@ static __init int hardware_setup(void)
ple_window_shrink = 0;
}
- if (!cpu_has_vmx_apicv()) {
+ if (!cpu_has_vmx_apicv())
enable_apicv = 0;
+ if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
- }
if (cpu_has_vmx_tsc_scaling()) {
kvm_has_tsc_control = true;
@@ -7780,16 +7969,8 @@ static __init int hardware_setup(void)
kvm_mmu_set_ept_masks(enable_ept_ad_bits,
cpu_has_vmx_ept_execute_only());
- if (!enable_ept)
- ept_lpage_level = 0;
- else if (cpu_has_vmx_ept_1g_page())
- ept_lpage_level = PG_LEVEL_1G;
- else if (cpu_has_vmx_ept_2m_page())
- ept_lpage_level = PG_LEVEL_2M;
- else
- ept_lpage_level = PG_LEVEL_4K;
kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
- ept_lpage_level);
+ ept_caps_to_lpage_level(vmx_capability.ept));
/*
* Only enable PML when hardware supports PML feature, and both EPT
@@ -7837,6 +8018,10 @@ static __init int hardware_setup(void)
return -EINVAL;
if (!enable_ept || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
+ if (pt_mode == PT_MODE_HOST_GUEST)
+ vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ else
+ vmx_init_ops.handle_intel_pt_intr = NULL;
setup_default_sgx_lepubkeyhash();
@@ -7865,6 +8050,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = {
.disabled_by_bios = vmx_disabled_by_bios,
.check_processor_compatibility = vmx_check_processor_compat,
.hardware_setup = hardware_setup,
+ .handle_intel_pt_intr = NULL,
.runtime_ops = &vmx_x86_ops,
};
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 4df2ac24ffc1..7f2c82e7f38f 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -159,6 +159,15 @@ struct nested_vmx {
bool dirty_vmcs12;
/*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
+
+ /*
* Indicates lazily loaded guest state has not yet been decached from
* vmcs02.
*/
@@ -308,6 +317,9 @@ struct vcpu_vmx {
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
+ /* Used if this vCPU is waiting for PI notification wakeup. */
+ struct list_head pi_wakeup_list;
+
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
@@ -340,7 +352,7 @@ struct vcpu_vmx {
struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15
struct {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -362,8 +374,9 @@ int allocate_vpid(void);
void free_vpid(int vpid);
void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
-void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
- unsigned long fs_base, unsigned long gs_base);
+void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
+ u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base);
int vmx_get_cpl(struct kvm_vcpu *vcpu);
bool vmx_emulation_required(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
@@ -473,19 +486,21 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL)
BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
-static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_RFLAGS)
- | (1 << VCPU_EXREG_PDPTR)
- | (1 << VCPU_EXREG_SEGMENTS)
- | (1 << VCPU_EXREG_CR0)
- | (1 << VCPU_EXREG_CR3)
- | (1 << VCPU_EXREG_CR4)
- | (1 << VCPU_EXREG_EXIT_INFO_1)
- | (1 << VCPU_EXREG_EXIT_INFO_2));
- vcpu->arch.regs_dirty = 0;
-}
+/*
+ * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the
+ * cache on demand. Other registers not listed here are synced to
+ * the cache immediately after VM-Exit.
+ */
+#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \
+ (1 << VCPU_REGS_RSP) | \
+ (1 << VCPU_EXREG_RFLAGS) | \
+ (1 << VCPU_EXREG_PDPTR) | \
+ (1 << VCPU_EXREG_SEGMENTS) | \
+ (1 << VCPU_EXREG_CR0) | \
+ (1 << VCPU_EXREG_CR3) | \
+ (1 << VCPU_EXREG_CR4) | \
+ (1 << VCPU_EXREG_EXIT_INFO_1) | \
+ (1 << VCPU_EXREG_EXIT_INFO_2))
static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 9e9ef47e988c..5e7f41225780 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -71,6 +71,31 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
{
unsigned long value;
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
+ asm_volatile_goto("1: vmread %[field], %[output]\n\t"
+ "jna %l[do_fail]\n\t"
+
+ _ASM_EXTABLE(1b, %l[do_exception])
+
+ : [output] "=r" (value)
+ : [field] "r" (field)
+ : "cc"
+ : do_fail, do_exception);
+
+ return value;
+
+do_fail:
+ WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field);
+ pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field);
+ return 0;
+
+do_exception:
+ kvm_spurious_fault();
+ return 0;
+
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
asm volatile("1: vmread %2, %1\n\t"
".byte 0x3e\n\t" /* branch taken hint */
"ja 3f\n\t"
@@ -80,9 +105,11 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
* @field, and bounce through the trampoline to preserve
* volatile registers.
*/
- "push $0\n\t"
+ "xorl %k1, %k1\n\t"
+ "2:\n\t"
+ "push %1\n\t"
"push %2\n\t"
- "2:call vmread_error_trampoline\n\t"
+ "call vmread_error_trampoline\n\t"
/*
* Unwind the stack. Note, the trampoline zeros out the
@@ -93,14 +120,12 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
"3:\n\t"
/* VMREAD faulted. As above, except push '1' for @fault. */
- ".pushsection .fixup, \"ax\"\n\t"
- "4: push $1\n\t"
- "push %2\n\t"
- "jmp 2b\n\t"
- ".popsection\n\t"
- _ASM_EXTABLE(1b, 4b)
- : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1)
+
+ : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc");
return value;
+
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
}
static __always_inline u16 vmcs_read16(unsigned long field)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5a403d92833f..74b53a16f38a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -118,6 +118,7 @@ static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
@@ -186,6 +187,11 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+/* Enable/disable PMU virtualization */
+bool __read_mostly enable_pmu = true;
+EXPORT_SYMBOL_GPL(enable_pmu);
+module_param(enable_pmu, bool, 0444);
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -210,7 +216,7 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs;
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
- | XFEATURE_MASK_PKRU)
+ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);
@@ -710,6 +716,17 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+ EMULTYPE_COMPLETE_USER_EXIT);
+}
+
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
++vcpu->stat.pf_guest;
@@ -798,8 +815,9 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
/*
* Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
*/
-int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
+int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
gpa_t real_gpa;
int i;
@@ -810,8 +828,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
* If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
* to an L1 GPA.
*/
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(pdpt_gfn),
- PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
+ PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
if (real_gpa == UNMAPPED_GVA)
return 0;
@@ -828,8 +846,16 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
}
}
+ /*
+ * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
+ * Shadow page roots need to be reconstructed instead.
+ */
+ if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
+ kvm_mmu_free_roots(vcpu, mmu, KVM_MMU_ROOT_CURRENT);
+
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
vcpu->arch.pdptrs_from_userspace = false;
return 1;
@@ -856,7 +882,6 @@ EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
cr0 |= X86_CR0_ET;
@@ -886,11 +911,12 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
- is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
- !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
+ !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
return 1;
- if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ if (!(cr0 & X86_CR0_PG) &&
+ (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
return 1;
static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -989,6 +1015,11 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
return 1;
}
+
+ if ((xcr0 & XFEATURE_MASK_XTILE) &&
+ ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
+ return 1;
+
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
@@ -1050,8 +1081,6 @@ EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP;
if (!kvm_is_valid_cr4(vcpu, cr4))
return 1;
@@ -1062,9 +1091,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if ((cr4 ^ old_cr4) & X86_CR4_LA57)
return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
- && ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
+ && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
+ && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
return 1;
if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
@@ -1153,14 +1181,15 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
return 1;
- if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
+ if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
return 1;
if (cr3 != kvm_read_cr3(vcpu))
kvm_mmu_new_pgd(vcpu, cr3);
vcpu->arch.cr3 = cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ /* Do not call post_set_cr3, we do not get here for confidential guests. */
handle_tlb_flush:
/*
@@ -1330,7 +1359,7 @@ static const u32 msrs_to_save_all[] = {
MSR_IA32_UMWAIT_CONTROL,
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
@@ -1358,6 +1387,7 @@ static const u32 msrs_to_save_all[] = {
MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR,
};
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
@@ -1814,22 +1844,36 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
-static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
{
- int err = vcpu->run->msr.error;
- if (!err) {
+ if (!vcpu->run->msr.error) {
kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
}
+}
+
+static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
+}
- return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+ complete_userspace_rdmsr(vcpu);
+ return complete_emulated_msr_access(vcpu);
}
-static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
{
return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
}
+static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
+{
+ complete_userspace_rdmsr(vcpu);
+ return complete_fast_msr_access(vcpu);
+}
+
static u64 kvm_msr_reason(int r)
{
switch (r) {
@@ -1864,18 +1908,6 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
return 1;
}
-static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
-{
- return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
- complete_emulated_rdmsr, r);
-}
-
-static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
-{
- return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
- complete_emulated_wrmsr, r);
-}
-
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
@@ -1884,18 +1916,16 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
r = kvm_get_msr(vcpu, ecx, &data);
- /* MSR read failed? See if we should ask user space */
- if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
- /* Bounce to user space */
- return 0;
- }
-
if (!r) {
trace_kvm_msr_read(ecx, data);
kvm_rax_write(vcpu, data & -1u);
kvm_rdx_write(vcpu, (data >> 32) & -1u);
} else {
+ /* MSR read failed? See if we should ask user space */
+ if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
+ complete_fast_rdmsr, r))
+ return 0;
trace_kvm_msr_read_ex(ecx);
}
@@ -1911,19 +1941,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
r = kvm_set_msr(vcpu, ecx, data);
- /* MSR write failed? See if we should ask user space */
- if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
- /* Bounce to user space */
- return 0;
-
- /* Signal all other negative errors to userspace */
- if (r < 0)
- return r;
-
- if (!r)
+ if (!r) {
trace_kvm_msr_write(ecx, data);
- else
+ } else {
+ /* MSR write failed? See if we should ask user space */
+ if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
+ complete_fast_msr_access, r))
+ return 0;
+ /* Signal all other negative errors to userspace */
+ if (r < 0)
+ return r;
trace_kvm_msr_write_ex(ecx, data);
+ }
return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
@@ -2118,7 +2147,7 @@ static s64 get_kvmclock_base_ns(void)
}
#endif
-void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
{
int version;
int r;
@@ -2816,7 +2845,7 @@ static void kvm_end_pvclock_update(struct kvm *kvm)
{
struct kvm_arch *ka = &kvm->arch;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
write_seqcount_end(&ka->pvclock_sc);
raw_spin_unlock_irq(&ka->tsc_write_lock);
@@ -3065,7 +3094,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
static void kvmclock_update_fn(struct work_struct *work)
{
- int i;
+ unsigned long i;
struct delayed_work *dwork = to_delayed_work(work);
struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
kvmclock_update_work);
@@ -3258,6 +3287,29 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
+
+static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
+}
+
+/*
+ * Service "local" TLB flush requests, which are specific to the current MMU
+ * context. In addition to the generic event handling in vcpu_enter_guest(),
+ * TLB flushes that are targeted at an MMU context also need to be serviced
+ * prior before nested VM-Enter/VM-Exit.
+ */
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
+{
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+
static void record_steal_time(struct kvm_vcpu *vcpu)
{
struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
@@ -3389,7 +3441,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated)
return 1;
- if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent))
+ if (kvm_get_msr_feature(&msr_ent))
return 1;
if (data & ~msr_ent.data)
return 1;
@@ -3483,6 +3535,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~supported_xss)
return 1;
vcpu->arch.ia32_xss = data;
+ kvm_update_cpuid_runtime(vcpu);
break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
@@ -3645,6 +3698,30 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.msr_misc_features_enables = data;
break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ if (data & ~(XFEATURE_MASK_USER_DYNAMIC &
+ vcpu->arch.guest_supported_xcr0))
+ return 1;
+
+ fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ if (data & ~(XFEATURE_MASK_USER_DYNAMIC &
+ vcpu->arch.guest_supported_xcr0))
+ return 1;
+
+ vcpu->arch.guest_fpu.xfd_err = data;
+ break;
+#endif
default:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
@@ -3965,6 +4042,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_K7_HWCR:
msr_info->data = vcpu->arch.msr_hwcr;
break;
+#ifdef CONFIG_X86_64
+ case MSR_IA32_XFD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
+ break;
+ case MSR_IA32_XFD_ERR:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ return 1;
+
+ msr_info->data = vcpu->arch.guest_fpu.xfd_err;
+ break;
+#endif
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
@@ -4133,9 +4226,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SGX_ATTRIBUTE:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
+ case KVM_CAP_SYS_ATTRIBUTES:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4147,7 +4242,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_XEN_HVM:
r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
- KVM_XEN_HVM_CONFIG_SHARED_INFO;
+ KVM_XEN_HVM_CONFIG_SHARED_INFO |
+ KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
break;
@@ -4225,11 +4321,61 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
else
r = 0;
break;
+ case KVM_CAP_XSAVE2: {
+ u64 guest_perm = xstate_get_guest_group_perm();
+
+ r = xstate_required_size(supported_xcr0 & guest_perm, false);
+ if (r < sizeof(struct kvm_xsave))
+ r = sizeof(struct kvm_xsave);
+ break;
+ }
default:
break;
}
return r;
+}
+
+static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
+{
+ void __user *uaddr = (void __user*)(unsigned long)attr->addr;
+ if ((u64)(unsigned long)uaddr != attr->addr)
+ return ERR_PTR(-EFAULT);
+ return uaddr;
+}
+
+static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
+
+ if (attr->group)
+ return -ENXIO;
+
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
+
+ switch (attr->attr) {
+ case KVM_X86_XCOMP_GUEST_SUPP:
+ if (put_user(supported_xcr0, uaddr))
+ return -EFAULT;
+ return 0;
+ default:
+ return -ENXIO;
+ break;
+ }
+}
+
+static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
+{
+ if (attr->group)
+ return -ENXIO;
+
+ switch (attr->attr) {
+ case KVM_X86_XCOMP_GUEST_SUPP:
+ return 0;
+ default:
+ return -ENXIO;
+ }
}
long kvm_arch_dev_ioctl(struct file *filp,
@@ -4320,6 +4466,22 @@ long kvm_arch_dev_ioctl(struct file *filp,
case KVM_GET_SUPPORTED_HV_CPUID:
r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
break;
+ case KVM_GET_DEVICE_ATTR: {
+ struct kvm_device_attr attr;
+ r = -EFAULT;
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ break;
+ r = kvm_x86_dev_get_attr(&attr);
+ break;
+ }
+ case KVM_HAS_DEVICE_ATTR: {
+ struct kvm_device_attr attr;
+ r = -EFAULT;
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ break;
+ r = kvm_x86_dev_has_attr(&attr);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -4448,8 +4610,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -4759,8 +4920,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
- if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
kvm_smm_changed(vcpu, events->smi.smm);
+ }
vcpu->arch.smi_pending = events->smi.pending;
@@ -4829,6 +4992,16 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
vcpu->arch.pkru);
}
+static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+ u8 *state, unsigned int size)
+{
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
+ return;
+
+ fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+ state, size, vcpu->arch.pkru);
+}
+
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
@@ -4911,11 +5084,11 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
int r;
- if ((u64)(unsigned long)uaddr != attr->addr)
- return -EFAULT;
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET:
@@ -4934,12 +5107,12 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
struct kvm *kvm = vcpu->kvm;
int r;
- if ((u64)(unsigned long)uaddr != attr->addr)
- return -EFAULT;
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET: {
@@ -5263,6 +5436,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_XSAVE: {
+ r = -EINVAL;
+ if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
+ break;
+
u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xsave)
@@ -5277,7 +5454,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_XSAVE: {
- u.xsave = memdup_user(argp, sizeof(*u.xsave));
+ int size = vcpu->arch.guest_fpu.uabi_size;
+
+ u.xsave = memdup_user(argp, size);
if (IS_ERR(u.xsave)) {
r = PTR_ERR(u.xsave);
goto out_nofree;
@@ -5286,6 +5465,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
break;
}
+
+ case KVM_GET_XSAVE2: {
+ int size = vcpu->arch.guest_fpu.uabi_size;
+
+ u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, size))
+ break;
+
+ r = 0;
+ break;
+ }
+
case KVM_GET_XCRS: {
u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
@@ -5650,7 +5848,7 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
* VM-Exit.
*/
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vcpu_kick(vcpu);
@@ -5698,6 +5896,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT);
r = 0;
split_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -5918,7 +6117,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
static int kvm_arch_suspend_notifier(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
- int i, ret = 0;
+ unsigned long i;
+ int ret = 0;
mutex_lock(&kvm->lock);
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -6078,6 +6278,7 @@ set_identity_unlock:
/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
smp_wmb();
kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
+ kvm_request_apicv_update(kvm, true, APICV_INHIBIT_REASON_ABSENT);
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
@@ -6377,6 +6578,11 @@ static void kvm_init_msr_list(void)
min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
continue;
break;
+ case MSR_IA32_XFD:
+ case MSR_IA32_XFD_ERR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ continue;
+ break;
default:
break;
}
@@ -6460,13 +6666,14 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
gpa_t t_gpa;
BUG_ON(!mmu_is_nested(vcpu));
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
+ t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
return t_gpa;
}
@@ -6474,25 +6681,31 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
@@ -6500,19 +6713,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+ return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 access,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
- exception);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -6540,13 +6755,14 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
/* Inline kvm_read_guest_virt_helper for speed. */
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
- exception);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
+ exception);
if (unlikely(gpa == UNMAPPED_GVA))
return X86EMUL_PROPAGATE_FAULT;
@@ -6605,13 +6821,12 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes
struct kvm_vcpu *vcpu, u32 access,
struct x86_exception *exception)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
- gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
- access,
- exception);
+ gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
@@ -6657,6 +6872,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
+}
+
int handle_ud(struct kvm_vcpu *vcpu)
{
static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
@@ -6664,7 +6886,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
- if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0)))
+ if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
return 1;
if (force_emulation_prefix &&
@@ -6698,6 +6920,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
@@ -6715,7 +6938,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
return 1;
}
- *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+ *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
if (*gpa == UNMAPPED_GVA)
return -1;
@@ -7077,7 +7300,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
unsigned short port, void *val, unsigned int count)
{
if (vcpu->arch.pio.count) {
- /* Complete previous iteration. */
+ /*
+ * Complete a previous iteration that required userspace I/O.
+ * Note, @count isn't guaranteed to match pio.count as userspace
+ * can modify ECX before rerunning the vCPU. Ignore any such
+ * shenanigans as KVM doesn't support modifying the rep count,
+ * and the emulator ensures @count doesn't overflow the buffer.
+ */
} else {
int r = __emulator_pio_in(vcpu, size, port, count);
if (!r)
@@ -7086,7 +7315,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
/* Results already available, fall through. */
}
- WARN_ON(count != vcpu->arch.pio.count);
complete_emulator_pio_in(vcpu, val);
return 1;
}
@@ -7344,7 +7572,8 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
r = kvm_get_msr(vcpu, msr_index, pdata);
- if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+ if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r)) {
/* Bounce to user space */
return X86EMUL_IO_NEEDED;
}
@@ -7360,7 +7589,8 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
r = kvm_set_msr(vcpu, msr_index, data);
- if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+ if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_msr_access, r)) {
/* Bounce to user space */
return X86EMUL_IO_NEEDED;
}
@@ -7911,6 +8141,8 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (unlikely(!r))
return 0;
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+
/*
* rflags is the old, "raw" value of the flags. The new value has
* not been saved yet.
@@ -8030,7 +8262,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool writeback = true;
bool write_fault_to_spt;
- if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len)))
+ if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
return 1;
vcpu->arch.l1tf_flush_l1d = true;
@@ -8078,12 +8310,23 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}
/*
- * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
- * for kvm_skip_emulated_instruction(). The caller is responsible for
- * updating interruptibility state and injecting single-step #DBs.
+ * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
+ * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
+ * The caller is responsible for updating interruptibility state and
+ * injecting single-step #DBs.
*/
if (emulation_type & EMULTYPE_SKIP) {
- kvm_rip_write(vcpu, ctxt->_eip);
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ ctxt->eip = (u32)ctxt->_eip;
+ else
+ ctxt->eip = ctxt->_eip;
+
+ if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) {
+ r = 1;
+ goto writeback;
+ }
+
+ kvm_rip_write(vcpu, ctxt->eip);
if (ctxt->eflags & X86_EFLAGS_RF)
kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
return 1;
@@ -8147,17 +8390,24 @@ restart:
writeback = false;
r = 0;
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
+ } else if (vcpu->arch.complete_userspace_io) {
+ writeback = false;
+ r = 0;
} else if (r == EMULATION_RESTART)
goto restart;
else
r = 1;
+writeback:
if (writeback) {
unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+ if (ctxt->is_branch)
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
@@ -8344,7 +8594,8 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i, send_ipi = 0;
+ int send_ipi = 0;
+ unsigned long i;
/*
* We allow guests to temporarily run on slowing clocks,
@@ -8469,57 +8720,12 @@ static void kvm_timer_init(void)
kvmclock_cpu_online, kvmclock_cpu_down_prep);
}
-DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
-EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
-
-int kvm_is_in_guest(void)
-{
- return __this_cpu_read(current_vcpu) != NULL;
-}
-
-static int kvm_is_user_mode(void)
-{
- int user_mode = 3;
-
- if (__this_cpu_read(current_vcpu))
- user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu));
-
- return user_mode != 0;
-}
-
-static unsigned long kvm_get_guest_ip(void)
-{
- unsigned long ip = 0;
-
- if (__this_cpu_read(current_vcpu))
- ip = kvm_rip_read(__this_cpu_read(current_vcpu));
-
- return ip;
-}
-
-static void kvm_handle_intel_pt_intr(void)
-{
- struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
-
- kvm_make_request(KVM_REQ_PMI, vcpu);
- __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
- (unsigned long *)&vcpu->arch.pmu.global_status);
-}
-
-static struct perf_guest_info_callbacks kvm_guest_cbs = {
- .is_in_guest = kvm_is_in_guest,
- .is_user_mode = kvm_is_user_mode,
- .get_guest_ip = kvm_get_guest_ip,
- .handle_intel_pt_intr = kvm_handle_intel_pt_intr,
-};
-
#ifdef CONFIG_X86_64
static void pvclock_gtod_update_fn(struct work_struct *work)
{
struct kvm *kvm;
-
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
@@ -8626,8 +8832,6 @@ int kvm_arch_init(void *opaque)
kvm_timer_init();
- perf_register_guest_info_callbacks(&kvm_guest_cbs);
-
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
@@ -8659,7 +8863,6 @@ void kvm_arch_exit(void)
clear_hv_tscchange_cb();
#endif
kvm_lapic_exit();
- perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
@@ -8680,8 +8883,15 @@ void kvm_arch_exit(void)
#endif
}
-static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
+static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
{
+ /*
+ * The vCPU has halted, e.g. executed HLT. Update the run state if the
+ * local APIC is in-kernel, the run loop will detect the non-runnable
+ * state and halt the vCPU. Exit to userspace if the local APIC is
+ * managed by userspace, in which case userspace is responsible for
+ * handling wake events.
+ */
++vcpu->stat.halt_exits;
if (lapic_in_kernel(vcpu)) {
vcpu->arch.mp_state = state;
@@ -8692,11 +8902,11 @@ static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
}
}
-int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
{
- return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
@@ -8705,7 +8915,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
* TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
* KVM_EXIT_DEBUG here.
*/
- return kvm_vcpu_halt(vcpu) && ret;
+ return kvm_emulate_halt_noskip(vcpu) && ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
@@ -8713,7 +8923,8 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
{
int ret = kvm_skip_emulated_instruction(vcpu);
- return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret;
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
+ KVM_EXIT_AP_RESET_HOLD) && ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
@@ -8776,10 +8987,9 @@ static void kvm_apicv_init(struct kvm *kvm)
{
init_rwsem(&kvm->arch.apicv_update_lock);
- if (enable_apicv)
- clear_bit(APICV_INHIBIT_REASON_DISABLE,
- &kvm->arch.apicv_inhibit_reasons);
- else
+ set_bit(APICV_INHIBIT_REASON_ABSENT,
+ &kvm->arch.apicv_inhibit_reasons);
+ if (!enable_apicv)
set_bit(APICV_INHIBIT_REASON_DISABLE,
&kvm->arch.apicv_inhibit_reasons);
}
@@ -8952,14 +9162,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
- /*
- * if_flag is obsolete and useless, so do not bother
- * setting it for SEV-ES guests. Userspace can just
- * use kvm_run->ready_for_interrupt_injection.
- */
- kvm_run->if_flag = !vcpu->arch.guest_state_protected
- && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
-
+ kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
@@ -9528,8 +9731,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -9573,7 +9775,7 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}
-void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu))
return;
@@ -9648,10 +9850,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* Flushing all ASIDs flushes the current ASID... */
kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
- kvm_vcpu_flush_tlb_guest(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -9801,11 +10000,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_mb__after_srcu_read_unlock();
/*
- * This handles the case where a posted interrupt was
- * notified with kvm_vcpu_kick.
+ * Process pending posted interrupts to handle the case where the
+ * notification IRQ arrived in the host, or was never sent (because the
+ * target vCPU wasn't running). Do this regardless of the vCPU's APICv
+ * status, KVM doesn't update assigned devices when APICv is inhibited,
+ * i.e. they can post interrupts even if APICv is temporarily disabled.
*/
- if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -9826,6 +10028,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_return();
+ if (vcpu->arch.guest_fpu.xfd_err)
+ wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
@@ -9849,8 +10054,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ if (kvm_lapic_enabled(vcpu))
+ static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
@@ -9887,8 +10092,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
+ /*
+ * Sync xfd before calling handle_exit_irqoff() which may
+ * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
+ * in #NM irqoff handler).
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ fpu_sync_guest_vmexit_xfd_state();
+
static_call(kvm_x86_handle_exit_irqoff)(vcpu);
+ if (vcpu->arch.guest_fpu.xfd_err)
+ wrmsrl(MSR_IA32_XFD_ERR, 0);
+
/*
* Consume any pending interrupts, including the possible source of
* VM-Exit on SVM and any ticks that occur between VM-Exit and now.
@@ -9896,7 +10112,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* interrupts on processors that implement an interrupt shadow, the
* stat.exits increment will do nicely.
*/
- kvm_before_interrupt(vcpu);
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
local_irq_enable();
++vcpu->stat.exits;
local_irq_disable();
@@ -9953,14 +10169,29 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
- if (!kvm_arch_vcpu_runnable(vcpu) &&
- (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
+ bool hv_timer;
+
+ if (!kvm_arch_vcpu_runnable(vcpu)) {
+ /*
+ * Switch to the software timer before halt-polling/blocking as
+ * the guest's timer may be a break event for the vCPU, and the
+ * hypervisor timer runs only when the CPU is in guest mode.
+ * Switch before halt-polling so that KVM recognizes an expired
+ * timer before blocking.
+ */
+ hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
+ if (hv_timer)
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- kvm_vcpu_block(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ kvm_vcpu_halt(vcpu);
+ else
+ kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_x86_ops.post_block)
- static_call(kvm_x86_post_block)(vcpu);
+ if (hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
@@ -10153,6 +10384,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
r = -EINTR;
goto out;
}
+ /*
+ * It should be impossible for the hypervisor timer to be in
+ * use before KVM has ever run the vCPU.
+ */
+ WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
kvm_vcpu_block(vcpu);
if (kvm_apic_accept_events(vcpu) < 0) {
r = 0;
@@ -10197,10 +10433,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
- if (kvm_run->immediate_exit)
+ if (kvm_run->immediate_exit) {
r = -EINTR;
- else
- r = vcpu_run(vcpu);
+ goto out;
+ }
+
+ r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
+ if (r <= 0)
+ goto out;
+
+ r = vcpu_run(vcpu);
out:
kvm_put_guest_fpu(vcpu);
@@ -10516,7 +10758,8 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
vcpu->arch.cr2 = sregs->cr2;
*mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+ static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -10533,7 +10776,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
if (update_pdptrs) {
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ load_pdptrs(vcpu, kvm_read_cr3(vcpu));
*mmu_reset_needed = 1;
}
srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -10631,7 +10874,7 @@ static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
{
bool inhibit = false;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
down_write(&kvm->arch.apicv_update_lock);
@@ -11035,7 +11278,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.msr_misc_features_enables = 0;
- vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+ __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
+ __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
}
/* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
@@ -11052,8 +11296,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
- vcpu->arch.ia32_xss = 0;
-
static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
@@ -11119,7 +11361,7 @@ int kvm_arch_hardware_enable(void)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
int ret;
u64 local_tsc;
u64 max_tsc = 0;
@@ -11229,6 +11471,8 @@ int kvm_arch_hardware_setup(void *opaque)
memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
kvm_ops_static_call_update();
+ kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
supported_xss = 0;
@@ -11256,6 +11500,8 @@ int kvm_arch_hardware_setup(void *opaque)
void kvm_arch_hardware_unsetup(void)
{
+ kvm_unregister_perf_callbacks();
+
static_call(kvm_x86_hardware_unsetup)();
}
@@ -11372,7 +11618,7 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
static void kvm_free_vcpus(struct kvm *kvm)
{
- unsigned int i;
+ unsigned long i;
struct kvm_vcpu *vcpu;
/*
@@ -11382,15 +11628,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
kvm_clear_async_pf_completion_queue(vcpu);
kvm_unload_vcpu_mmu(vcpu);
}
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_vcpu_destroy(vcpu);
-
- mutex_lock(&kvm->lock);
- for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
- kvm->vcpus[i] = NULL;
- atomic_set(&kvm->online_vcpus, 0);
- mutex_unlock(&kvm->lock);
+ kvm_destroy_vcpus(kvm);
}
void kvm_arch_sync_events(struct kvm *kvm)
@@ -11558,9 +11797,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
}
static int kvm_alloc_memslot_metadata(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- unsigned long npages)
+ struct kvm_memory_slot *slot)
{
+ unsigned long npages = slot->npages;
int i, r;
/*
@@ -11625,7 +11864,7 @@ out_free:
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
struct kvm_vcpu *vcpu;
- int i;
+ unsigned long i;
/*
* memslots->generation has been incremented.
@@ -11639,13 +11878,18 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- const struct kvm_userspace_memory_region *mem,
- enum kvm_mr_change change)
+ const struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
- return kvm_alloc_memslot_metadata(kvm, memslot,
- mem->memory_size >> PAGE_SHIFT);
+ return kvm_alloc_memslot_metadata(kvm, new);
+
+ if (change == KVM_MR_FLAGS_ONLY)
+ memcpy(&new->arch, &old->arch, sizeof(old->arch));
+ else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
+ return -EIO;
+
return 0;
}
@@ -11669,13 +11913,15 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+ u32 old_flags = old ? old->flags : 0;
+ u32 new_flags = new ? new->flags : 0;
+ bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
/*
* Update CPU dirty logging if dirty logging is being toggled. This
* applies to all operations.
*/
- if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)
+ if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
/*
@@ -11693,7 +11939,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
* MOVE/DELETE: The old mappings will already have been cleaned up by
* kvm_arch_flush_shadow_memslot().
*/
- if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
+ if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
return;
/*
@@ -11701,7 +11947,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
* other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
* logging isn't being toggled on or off.
*/
- if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)))
+ if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
return;
if (!log_dirty_pages) {
@@ -11737,14 +11983,18 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot *old,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- if (!kvm->arch.n_requested_mmu_pages)
- kvm_mmu_change_mmu_pages(kvm,
- kvm_mmu_calculate_default_mmu_pages(kvm));
+ if (!kvm->arch.n_requested_mmu_pages &&
+ (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
+ unsigned long nr_mmu_pages;
+
+ nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ }
kvm_mmu_slot_apply_flags(kvm, old, new, change);
@@ -11845,6 +12095,11 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
return vcpu->arch.preempted_in_kernel;
}
+unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
+{
+ return kvm_rip_read(vcpu);
+}
+
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
@@ -12254,12 +12509,13 @@ EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
{
+ struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
struct x86_exception fault;
u32 access = error_code &
(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
if (!(error_code & PFERR_PRESENT_MASK) ||
- vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
+ mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != UNMAPPED_GVA) {
/*
* If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
* tables probably do not match the TLB. Just proceed
@@ -12596,6 +12852,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 997669ae9caa..635b75f9e145 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -103,6 +103,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
@@ -185,12 +186,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
-static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- static_call(kvm_x86_tlb_flush_current)(vcpu);
-}
-
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -306,7 +301,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
}
-void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs);
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
@@ -342,6 +336,7 @@ extern u64 host_xcr0;
extern u64 supported_xcr0;
extern u64 host_xss;
extern u64 supported_xss;
+extern bool enable_pmu;
static inline bool kvm_mpx_supported(void)
{
@@ -397,18 +392,27 @@ static inline bool kvm_cstate_in_guest(struct kvm *kvm)
return kvm->arch.cstate_in_guest;
}
-DECLARE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+enum kvm_intr_type {
+ /* Values are arbitrary, but must be non-zero. */
+ KVM_HANDLING_IRQ = 1,
+ KVM_HANDLING_NMI,
+};
-static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu)
+static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+ enum kvm_intr_type intr)
{
- __this_cpu_write(current_vcpu, vcpu);
+ WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
}
static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
{
- __this_cpu_write(current_vcpu, NULL);
+ WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
}
+static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.handling_intr_from_guest == KVM_HANDLING_NMI;
+}
static inline bool kvm_pat_valid(u64 data)
{
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index dff2bdf9507a..bad57535fad0 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -16,6 +16,7 @@
#include <trace/events/kvm.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/event_channel.h>
#include "trace.h"
@@ -23,38 +24,77 @@ DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
{
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ struct pvclock_wall_clock *wc;
gpa_t gpa = gfn_to_gpa(gfn);
- int wc_ofs, sec_hi_ofs;
+ u32 *wc_sec_hi;
+ u32 wc_version;
+ u64 wall_nsec;
int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) {
- ret = -EFAULT;
+ if (gfn == GPA_INVALID) {
+ kvm_gfn_to_pfn_cache_destroy(kvm, gpc);
goto out;
}
- kvm->arch.xen.shinfo_gfn = gfn;
+
+ do {
+ ret = kvm_gfn_to_pfn_cache_init(kvm, gpc, NULL, false, true,
+ gpa, PAGE_SIZE, false);
+ if (ret)
+ goto out;
+
+ /*
+ * This code mirrors kvm_write_wall_clock() except that it writes
+ * directly through the pfn cache and doesn't mark the page dirty.
+ */
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
+
+ /* It could be invalid again already, so we need to check */
+ read_lock_irq(&gpc->lock);
+
+ if (gpc->valid)
+ break;
+
+ read_unlock_irq(&gpc->lock);
+ } while (1);
/* Paranoia checks on the 32-bit struct layout */
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
- /* 32-bit location by default */
- wc_ofs = offsetof(struct compat_shared_info, wc);
- sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi);
-
#ifdef CONFIG_X86_64
/* Paranoia checks on the 64-bit struct layout */
BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
- if (kvm->arch.xen.long_mode) {
- wc_ofs = offsetof(struct shared_info, wc);
- sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi);
- }
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+
+ wc_sec_hi = &shinfo->wc_sec_hi;
+ wc = &shinfo->wc;
+ } else
#endif
+ {
+ struct compat_shared_info *shinfo = gpc->khva;
+
+ wc_sec_hi = &shinfo->arch.wc_sec_hi;
+ wc = &shinfo->wc;
+ }
+
+ /* Increment and ensure an odd value */
+ wc_version = wc->version = (wc->version + 1) | 1;
+ smp_wmb();
+
+ wc->nsec = do_div(wall_nsec, 1000000000);
+ wc->sec = (u32)wall_nsec;
+ *wc_sec_hi = wall_nsec >> 32;
+ smp_wmb();
+
+ wc->version = wc_version + 1;
+ read_unlock_irq(&gpc->lock);
- kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs);
kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
out:
@@ -190,6 +230,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
+ unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
+ bool atomic = in_atomic() || !task_is_running(current);
int err;
u8 rc = 0;
@@ -199,6 +241,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
*/
struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
struct kvm_memslots *slots = kvm_memslots(v->kvm);
+ bool ghc_valid = slots->generation == ghc->generation &&
+ !kvm_is_error_hva(ghc->hva) && ghc->memslot;
+
unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
/* No need for compat handling here */
@@ -214,8 +259,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
* cache in kvm_read_guest_offset_cached(), but just uses
* __get_user() instead. And falls back to the slow path.
*/
- if (likely(slots->generation == ghc->generation &&
- !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+ if (!evtchn_pending_sel && ghc_valid) {
/* Fast path */
pagefault_disable();
err = __get_user(rc, (u8 __user *)ghc->hva + offset);
@@ -234,11 +278,76 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
* and we'll end up getting called again from a context where we *can*
* fault in the page and wait for it.
*/
- if (in_atomic() || !task_is_running(current))
+ if (atomic)
return 1;
- kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
- sizeof(rc));
+ if (!ghc_valid) {
+ err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
+ if (err || !ghc->memslot) {
+ /*
+ * If this failed, userspace has screwed up the
+ * vcpu_info mapping. No interrupts for you.
+ */
+ return 0;
+ }
+ }
+
+ /*
+ * Now we have a valid (protected by srcu) userspace HVA in
+ * ghc->hva which points to the struct vcpu_info. If there
+ * are any bits in the in-kernel evtchn_pending_sel then
+ * we need to write those to the guest vcpu_info and set
+ * its evtchn_upcall_pending flag. If there aren't any bits
+ * to add, we only want to *check* evtchn_upcall_pending.
+ */
+ if (evtchn_pending_sel) {
+ bool long_mode = v->kvm->arch.xen.long_mode;
+
+ if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
+ return 0;
+
+ if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
+ struct vcpu_info __user *vi = (void __user *)ghc->hva;
+
+ /* Attempt to set the evtchn_pending_sel bits in the
+ * guest, and if that succeeds then clear the same
+ * bits in the in-kernel version. */
+ asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
+ "\tnotq %0\n"
+ "\t" LOCK_PREFIX "andq %0, %2\n"
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "=r" (evtchn_pending_sel),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel));
+ } else {
+ struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
+ u32 evtchn_pending_sel32 = evtchn_pending_sel;
+
+ /* Attempt to set the evtchn_pending_sel bits in the
+ * guest, and if that succeeds then clear the same
+ * bits in the in-kernel version. */
+ asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
+ "\tnotl %0\n"
+ "\t" LOCK_PREFIX "andl %0, %2\n"
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "=r" (evtchn_pending_sel32),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel32));
+ }
+ rc = 1;
+ unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
+
+ err:
+ user_access_end();
+
+ mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+ } else {
+ __get_user(rc, (u8 __user *)ghc->hva + offset);
+ }
return rc;
}
@@ -260,15 +369,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- if (data->u.shared_info.gfn == GPA_INVALID) {
- kvm->arch.xen.shinfo_gfn = GPA_INVALID;
- r = 0;
- break;
- }
r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
break;
-
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
@@ -299,7 +402,10 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- data->u.shared_info.gfn = kvm->arch.xen.shinfo_gfn;
+ if (kvm->arch.xen.shinfo_cache.active)
+ data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
+ else
+ data->u.shared_info.gfn = GPA_INVALID;
r = 0;
break;
@@ -661,11 +767,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
void kvm_xen_init_vm(struct kvm *kvm)
{
- kvm->arch.xen.shinfo_gfn = GPA_INVALID;
}
void kvm_xen_destroy_vm(struct kvm *kvm)
{
+ kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
+
if (kvm->arch.xen_hvm_config.msr)
static_branch_slow_dec_deferred(&kvm_xen_enabled);
}
@@ -737,3 +844,179 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
return 0;
}
+
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+ return EVTCHN_2L_NR_CHANNELS;
+ else
+ return COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+
+/*
+ * This follows the kvm_set_irq() API, so it returns:
+ * < 0 Interrupt was ignored (masked or not delivered for other reasons)
+ * = 0 Interrupt was coalesced (previous irq is still pending)
+ * > 0 Number of CPUs interrupt was delivered to
+ */
+int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm)
+{
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ struct kvm_vcpu *vcpu;
+ unsigned long *pending_bits, *mask_bits;
+ unsigned long flags;
+ int port_word_bit;
+ bool kick_vcpu = false;
+ int idx;
+ int rc;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
+ if (!vcpu)
+ return -1;
+
+ if (!vcpu->arch.xen.vcpu_info_set)
+ return -1;
+
+ if (e->xen_evtchn.port >= max_evtchn_port(kvm))
+ return -1;
+
+ rc = -EWOULDBLOCK;
+ read_lock_irqsave(&gpc->lock, flags);
+
+ idx = srcu_read_lock(&kvm->srcu);
+ if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ goto out_rcu;
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+ port_word_bit = e->xen_evtchn.port / 64;
+ } else {
+ struct compat_shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ mask_bits = (unsigned long *)&shinfo->evtchn_mask;
+ port_word_bit = e->xen_evtchn.port / 32;
+ }
+
+ /*
+ * If this port wasn't already set, and if it isn't masked, then
+ * we try to set the corresponding bit in the in-kernel shadow of
+ * evtchn_pending_sel for the target vCPU. And if *that* wasn't
+ * already set, then we kick the vCPU in question to write to the
+ * *real* evtchn_pending_sel in its own guest vcpu_info struct.
+ */
+ if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
+ rc = 0; /* It was already raised */
+ } else if (test_bit(e->xen_evtchn.port, mask_bits)) {
+ rc = -1; /* Masked */
+ } else {
+ rc = 1; /* Delivered. But was the vCPU waking already? */
+ if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+ kick_vcpu = true;
+ }
+
+ out_rcu:
+ srcu_read_unlock(&kvm->srcu, idx);
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kick_vcpu) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+
+ return rc;
+}
+
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ bool mm_borrowed = false;
+ int rc;
+
+ if (!level)
+ return -1;
+
+ rc = kvm_xen_set_evtchn_fast(e, kvm);
+ if (rc != -EWOULDBLOCK)
+ return rc;
+
+ if (current->mm != kvm->mm) {
+ /*
+ * If not on a thread which already belongs to this KVM,
+ * we'd better be in the irqfd workqueue.
+ */
+ if (WARN_ON_ONCE(current->mm))
+ return -EINVAL;
+
+ kthread_use_mm(kvm->mm);
+ mm_borrowed = true;
+ }
+
+ /*
+ * For the irqfd workqueue, using the main kvm->lock mutex is
+ * fine since this function is invoked from kvm_set_irq() with
+ * no other lock held, no srcu. In future if it will be called
+ * directly from a vCPU thread (e.g. on hypercall for an IPI)
+ * then it may need to switch to using a leaf-node mutex for
+ * serializing the shared_info mapping.
+ */
+ mutex_lock(&kvm->lock);
+
+ /*
+ * It is theoretically possible for the page to be unmapped
+ * and the MMU notifier to invalidate the shared_info before
+ * we even get to use it. In that case, this looks like an
+ * infinite loop. It was tempting to do it via the userspace
+ * HVA instead... but that just *hides* the fact that it's
+ * an infinite loop, because if a fault occurs and it waits
+ * for the page to come back, it can *still* immediately
+ * fault and have to wait again, repeatedly.
+ *
+ * Conversely, the page could also have been reinstated by
+ * another thread before we even obtain the mutex above, so
+ * check again *first* before remapping it.
+ */
+ do {
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ int idx;
+
+ rc = kvm_xen_set_evtchn_fast(e, kvm);
+ if (rc != -EWOULDBLOCK)
+ break;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa,
+ PAGE_SIZE, false);
+ srcu_read_unlock(&kvm->srcu, idx);
+ } while(!rc);
+
+ mutex_unlock(&kvm->lock);
+
+ if (mm_borrowed)
+ kthread_unuse_mm(kvm->mm);
+
+ return rc;
+}
+
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+
+{
+ if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ e->xen_evtchn.port = ue->u.xen_evtchn.port;
+ e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
+ e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
+ e->set = evtchn_set_fn;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index cc0cf5f37450..adbcc9ed59db 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
void kvm_xen_init_vm(struct kvm *kvm);
void kvm_xen_destroy_vm(struct kvm *kvm);
+int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm);
+int kvm_xen_setup_evtchn(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue);
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -134,6 +140,9 @@ struct compat_shared_info {
struct compat_arch_shared_info arch;
};
+#define COMPAT_EVTCHN_2L_NR_CHANNELS (8 * \
+ sizeof_field(struct compat_shared_info, \
+ evtchn_pending))
struct compat_vcpu_runstate_info {
int state;
uint64_t state_entry_time;
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index c6506c6a7092..f76747862bd2 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -63,7 +63,6 @@ ifeq ($(CONFIG_X86_32),y)
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
- lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 16bc9130e7a5..e768815e58ae 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -9,81 +9,83 @@
#include <asm/alternative.h>
/* if you want SMP support, implement these with real spinlocks */
-.macro LOCK reg
+.macro IRQ_SAVE reg
pushfl
cli
.endm
-.macro UNLOCK reg
+.macro IRQ_RESTORE reg
popfl
.endm
-#define BEGIN(op) \
+#define BEGIN_IRQ_SAVE(op) \
.macro endp; \
SYM_FUNC_END(atomic64_##op##_386); \
.purgem endp; \
.endm; \
SYM_FUNC_START(atomic64_##op##_386); \
- LOCK v;
+ IRQ_SAVE v;
#define ENDP endp
-#define RET \
- UNLOCK v; \
- ret
-
-#define RET_ENDP \
- RET; \
- ENDP
+#define RET_IRQ_RESTORE \
+ IRQ_RESTORE v; \
+ RET
#define v %ecx
-BEGIN(read)
+BEGIN_IRQ_SAVE(read)
movl (v), %eax
movl 4(v), %edx
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(set)
+BEGIN_IRQ_SAVE(set)
movl %ebx, (v)
movl %ecx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(xchg)
+BEGIN_IRQ_SAVE(xchg)
movl (v), %eax
movl 4(v), %edx
movl %ebx, (v)
movl %ecx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(add)
+BEGIN_IRQ_SAVE(add)
addl %eax, (v)
adcl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(add_return)
+BEGIN_IRQ_SAVE(add_return)
addl (v), %eax
adcl 4(v), %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(sub)
+BEGIN_IRQ_SAVE(sub)
subl %eax, (v)
sbbl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %ecx
-BEGIN(sub_return)
+BEGIN_IRQ_SAVE(sub_return)
negl %edx
negl %eax
sbbl $0, %edx
@@ -91,47 +93,52 @@ BEGIN(sub_return)
adcl 4(v), %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(inc)
+BEGIN_IRQ_SAVE(inc)
addl $1, (v)
adcl $0, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(inc_return)
+BEGIN_IRQ_SAVE(inc_return)
movl (v), %eax
movl 4(v), %edx
addl $1, %eax
adcl $0, %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(dec)
+BEGIN_IRQ_SAVE(dec)
subl $1, (v)
sbbl $0, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(dec_return)
+BEGIN_IRQ_SAVE(dec_return)
movl (v), %eax
movl 4(v), %edx
subl $1, %eax
sbbl $0, %edx
movl %eax, (v)
movl %edx, 4(v)
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
#define v %esi
-BEGIN(add_unless)
+BEGIN_IRQ_SAVE(add_unless)
addl %eax, %ecx
adcl %edx, %edi
addl (v), %eax
@@ -143,7 +150,7 @@ BEGIN(add_unless)
movl %edx, 4(v)
movl $1, %eax
2:
- RET
+ RET_IRQ_RESTORE
3:
cmpl %edx, %edi
jne 1b
@@ -153,7 +160,7 @@ ENDP
#undef v
#define v %esi
-BEGIN(inc_not_zero)
+BEGIN_IRQ_SAVE(inc_not_zero)
movl (v), %eax
movl 4(v), %edx
testl %eax, %eax
@@ -165,7 +172,7 @@ BEGIN(inc_not_zero)
movl %edx, 4(v)
movl $1, %eax
2:
- RET
+ RET_IRQ_RESTORE
3:
testl %edx, %edx
jne 1b
@@ -174,7 +181,7 @@ ENDP
#undef v
#define v %esi
-BEGIN(dec_if_positive)
+BEGIN_IRQ_SAVE(dec_if_positive)
movl (v), %eax
movl 4(v), %edx
subl $1, %eax
@@ -183,5 +190,6 @@ BEGIN(dec_if_positive)
movl %eax, (v)
movl %edx, 4(v)
1:
-RET_ENDP
+ RET_IRQ_RESTORE
+ENDP
#undef v
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index ce6935690766..90afb488b396 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -18,7 +18,7 @@
SYM_FUNC_START(atomic64_read_cx8)
read64 %ecx
- ret
+ RET
SYM_FUNC_END(atomic64_read_cx8)
SYM_FUNC_START(atomic64_set_cx8)
@@ -28,7 +28,7 @@ SYM_FUNC_START(atomic64_set_cx8)
cmpxchg8b (%esi)
jne 1b
- ret
+ RET
SYM_FUNC_END(atomic64_set_cx8)
SYM_FUNC_START(atomic64_xchg_cx8)
@@ -37,7 +37,7 @@ SYM_FUNC_START(atomic64_xchg_cx8)
cmpxchg8b (%esi)
jne 1b
- ret
+ RET
SYM_FUNC_END(atomic64_xchg_cx8)
.macro addsub_return func ins insc
@@ -68,7 +68,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8)
popl %esi
popl %ebx
popl %ebp
- ret
+ RET
SYM_FUNC_END(atomic64_\func\()_return_cx8)
.endm
@@ -93,7 +93,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8)
movl %ebx, %eax
movl %ecx, %edx
popl %ebx
- ret
+ RET
SYM_FUNC_END(atomic64_\func\()_return_cx8)
.endm
@@ -118,7 +118,7 @@ SYM_FUNC_START(atomic64_dec_if_positive_cx8)
movl %ebx, %eax
movl %ecx, %edx
popl %ebx
- ret
+ RET
SYM_FUNC_END(atomic64_dec_if_positive_cx8)
SYM_FUNC_START(atomic64_add_unless_cx8)
@@ -149,7 +149,7 @@ SYM_FUNC_START(atomic64_add_unless_cx8)
addl $8, %esp
popl %ebx
popl %ebp
- ret
+ RET
4:
cmpl %edx, 4(%esp)
jne 2b
@@ -176,5 +176,5 @@ SYM_FUNC_START(atomic64_inc_not_zero_cx8)
movl $1, %eax
3:
popl %ebx
- ret
+ RET
SYM_FUNC_END(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 4304320e51f4..23318c338db0 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -127,7 +127,7 @@ SYM_FUNC_START(csum_partial)
8:
popl %ebx
popl %esi
- ret
+ RET
SYM_FUNC_END(csum_partial)
#else
@@ -245,7 +245,7 @@ SYM_FUNC_START(csum_partial)
90:
popl %ebx
popl %esi
- ret
+ RET
SYM_FUNC_END(csum_partial)
#endif
@@ -260,9 +260,9 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
* Copy from ds while checksumming, otherwise like csum_partial
*/
-#define EXC(y...) \
- 9999: y; \
- _ASM_EXTABLE_UA(9999b, 6001f)
+#define EXC(y...) \
+ 9999: y; \
+ _ASM_EXTABLE_TYPE(9999b, 7f, EX_TYPE_UACCESS | EX_FLAG_CLEAR_AX)
#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
@@ -358,20 +358,11 @@ EXC( movb %cl, (%edi) )
adcl $0, %eax
7:
-# Exception handler:
-.section .fixup, "ax"
-
-6001:
- xorl %eax, %eax
- jmp 7b
-
-.previous
-
popl %ebx
popl %esi
popl %edi
popl %ecx # equivalent to addl $4,%esp
- ret
+ RET
SYM_FUNC_END(csum_partial_copy_generic)
#else
@@ -439,15 +430,11 @@ EXC( movb %dl, (%edi) )
6: addl %edx, %eax
adcl $0, %eax
7:
-.section .fixup, "ax"
-6001: xorl %eax, %eax
- jmp 7b
-.previous
popl %esi
popl %edi
popl %ebx
- ret
+ RET
SYM_FUNC_END(csum_partial_copy_generic)
#undef ROUND
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index c4c7dd115953..fe59b8ac4fcc 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -17,7 +17,7 @@ SYM_FUNC_START(clear_page_rep)
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
- ret
+ RET
SYM_FUNC_END(clear_page_rep)
EXPORT_SYMBOL_GPL(clear_page_rep)
@@ -39,7 +39,7 @@ SYM_FUNC_START(clear_page_orig)
leaq 64(%rdi),%rdi
jnz .Lloop
nop
- ret
+ RET
SYM_FUNC_END(clear_page_orig)
EXPORT_SYMBOL_GPL(clear_page_orig)
@@ -47,6 +47,6 @@ SYM_FUNC_START(clear_page_erms)
movl $4096,%ecx
xorl %eax,%eax
rep stosb
- ret
+ RET
SYM_FUNC_END(clear_page_erms)
EXPORT_SYMBOL_GPL(clear_page_erms)
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 3542502faa3b..33c70c0160ea 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -37,11 +37,11 @@ SYM_FUNC_START(this_cpu_cmpxchg16b_emu)
popfq
mov $1, %al
- ret
+ RET
.Lnot_same:
popfq
xor %al,%al
- ret
+ RET
SYM_FUNC_END(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index ca01ed6029f4..6a912d58fecc 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -32,7 +32,7 @@ SYM_FUNC_START(cmpxchg8b_emu)
movl %ecx, 4(%esi)
popfl
- ret
+ RET
.Lnot_same:
movl (%esi), %eax
@@ -40,7 +40,7 @@ SYM_FUNC_START(cmpxchg8b_emu)
movl 4(%esi), %edx
popfl
- ret
+ RET
SYM_FUNC_END(cmpxchg8b_emu)
EXPORT_SYMBOL(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_mc_64.S b/arch/x86/lib/copy_mc_64.S
index 7334055157ba..c859a8a09860 100644
--- a/arch/x86/lib/copy_mc_64.S
+++ b/arch/x86/lib/copy_mc_64.S
@@ -77,10 +77,8 @@ SYM_FUNC_START(copy_mc_fragile)
.L_done_memcpy_trap:
xorl %eax, %eax
.L_done:
- ret
-SYM_FUNC_END(copy_mc_fragile)
+ RET
- .section .fixup, "ax"
/*
* Return number of bytes not copied for any failure. Note that
* there is no "tail" handling since the source buffer is 8-byte
@@ -105,14 +103,14 @@ SYM_FUNC_END(copy_mc_fragile)
movl %ecx, %edx
jmp copy_mc_fragile_handle_tail
- .previous
-
_ASM_EXTABLE_TYPE(.L_read_leading_bytes, .E_leading_bytes, EX_TYPE_DEFAULT_MCE_SAFE)
_ASM_EXTABLE_TYPE(.L_read_words, .E_read_words, EX_TYPE_DEFAULT_MCE_SAFE)
_ASM_EXTABLE_TYPE(.L_read_trailing_bytes, .E_trailing_bytes, EX_TYPE_DEFAULT_MCE_SAFE)
_ASM_EXTABLE(.L_write_leading_bytes, .E_leading_bytes)
_ASM_EXTABLE(.L_write_words, .E_write_words)
_ASM_EXTABLE(.L_write_trailing_bytes, .E_trailing_bytes)
+
+SYM_FUNC_END(copy_mc_fragile)
#endif /* CONFIG_X86_MCE */
/*
@@ -132,10 +130,8 @@ SYM_FUNC_START(copy_mc_enhanced_fast_string)
rep movsb
/* Copy successful. Return zero */
xorl %eax, %eax
- ret
-SYM_FUNC_END(copy_mc_enhanced_fast_string)
+ RET
- .section .fixup, "ax"
.E_copy:
/*
* On fault %rcx is updated such that the copy instruction could
@@ -145,9 +141,9 @@ SYM_FUNC_END(copy_mc_enhanced_fast_string)
* user-copy routines.
*/
movq %rcx, %rax
- ret
-
- .previous
+ RET
_ASM_EXTABLE_TYPE(.L_copy, .E_copy, EX_TYPE_DEFAULT_MCE_SAFE)
+
+SYM_FUNC_END(copy_mc_enhanced_fast_string)
#endif /* !CONFIG_UML */
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index db4b4f9197c7..30ea644bf446 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -17,7 +17,7 @@ SYM_FUNC_START(copy_page)
ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx
rep movsq
- ret
+ RET
SYM_FUNC_END(copy_page)
EXPORT_SYMBOL(copy_page)
@@ -85,5 +85,5 @@ SYM_FUNC_START_LOCAL(copy_page_regs)
movq (%rsp), %rbx
movq 1*8(%rsp), %r12
addq $2*8, %rsp
- ret
+ RET
SYM_FUNC_END(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 2797e630b9b1..8ca5ecf16dc4 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -32,14 +32,10 @@
decl %ecx
jnz 100b
102:
- .section .fixup,"ax"
-103: addl %ecx,%edx /* ecx is zerorest also */
- jmp .Lcopy_user_handle_tail
- .previous
- _ASM_EXTABLE_CPY(100b, 103b)
- _ASM_EXTABLE_CPY(101b, 103b)
- .endm
+ _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
+ _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
+.endm
/*
* copy_user_generic_unrolled - memory copy with exception handling.
@@ -105,9 +101,8 @@ SYM_FUNC_START(copy_user_generic_unrolled)
jnz 21b
23: xor %eax,%eax
ASM_CLAC
- ret
+ RET
- .section .fixup,"ax"
30: shll $6,%ecx
addl %ecx,%edx
jmp 60f
@@ -115,7 +110,6 @@ SYM_FUNC_START(copy_user_generic_unrolled)
jmp 60f
50: movl %ecx,%edx
60: jmp .Lcopy_user_handle_tail /* ecx is zerorest also */
- .previous
_ASM_EXTABLE_CPY(1b, 30b)
_ASM_EXTABLE_CPY(2b, 30b)
@@ -166,20 +160,16 @@ SYM_FUNC_START(copy_user_generic_string)
movl %edx,%ecx
shrl $3,%ecx
andl $7,%edx
-1: rep
- movsq
+1: rep movsq
2: movl %edx,%ecx
-3: rep
- movsb
+3: rep movsb
xorl %eax,%eax
ASM_CLAC
- ret
+ RET
- .section .fixup,"ax"
11: leal (%rdx,%rcx,8),%ecx
12: movl %ecx,%edx /* ecx is zerorest also */
jmp .Lcopy_user_handle_tail
- .previous
_ASM_EXTABLE_CPY(1b, 11b)
_ASM_EXTABLE_CPY(3b, 12b)
@@ -200,19 +190,16 @@ EXPORT_SYMBOL(copy_user_generic_string)
*/
SYM_FUNC_START(copy_user_enhanced_fast_string)
ASM_STAC
- cmpl $64,%edx
- jb .L_copy_short_string /* less then 64 bytes, avoid the costly 'rep' */
+ /* CPUs without FSRM should avoid rep movsb for short copies */
+ ALTERNATIVE "cmpl $64, %edx; jb .L_copy_short_string", "", X86_FEATURE_FSRM
movl %edx,%ecx
-1: rep
- movsb
+1: rep movsb
xorl %eax,%eax
ASM_CLAC
- ret
+ RET
- .section .fixup,"ax"
12: movl %ecx,%edx /* ecx is zerorest also */
jmp .Lcopy_user_handle_tail
- .previous
_ASM_EXTABLE_CPY(1b, 12b)
SYM_FUNC_END(copy_user_enhanced_fast_string)
@@ -225,6 +212,7 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
* Don't try to copy the tail if machine check happened
*
* Input:
+ * eax trap number written by ex_handler_copy()
* rdi destination
* rsi source
* rdx count
@@ -233,13 +221,26 @@ EXPORT_SYMBOL(copy_user_enhanced_fast_string)
* eax uncopied bytes or 0 if successful.
*/
SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
+ cmp $X86_TRAP_MC,%eax
+ je 3f
+
movl %edx,%ecx
1: rep movsb
2: mov %ecx,%eax
ASM_CLAC
- ret
+ RET
+
+3:
+ movl %edx,%eax
+ ASM_CLAC
+ RET
_ASM_EXTABLE_CPY(1b, 2b)
+
+.Lcopy_user_handle_align:
+ addl %ecx,%edx /* ecx is zerorest also */
+ jmp .Lcopy_user_handle_tail
+
SYM_CODE_END(.Lcopy_user_handle_tail)
/*
@@ -348,9 +349,8 @@ SYM_FUNC_START(__copy_user_nocache)
xorl %eax,%eax
ASM_CLAC
sfence
- ret
+ RET
- .section .fixup,"ax"
.L_fixup_4x8b_copy:
shll $6,%ecx
addl %ecx,%edx
@@ -366,7 +366,6 @@ SYM_FUNC_START(__copy_user_nocache)
.L_fixup_handle_tail:
sfence
jmp .Lcopy_user_handle_tail
- .previous
_ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
_ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 1fbd8ee9642d..d9e16a2cf285 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -201,7 +201,7 @@ SYM_FUNC_START(csum_partial_copy_generic)
movq 3*8(%rsp), %r13
movq 4*8(%rsp), %r15
addq $5*8, %rsp
- ret
+ RET
.Lshort:
movl %ecx, %r10d
jmp .L1
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index e7925d668b68..1f8a8f895173 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -9,6 +9,7 @@
#include <linux/compiler.h>
#include <linux/export.h>
#include <asm/checksum.h>
+#include <asm/word-at-a-time.h>
static inline unsigned short from32to16(unsigned a)
{
@@ -21,120 +22,119 @@ static inline unsigned short from32to16(unsigned a)
}
/*
- * Do a 64-bit checksum on an arbitrary memory area.
+ * Do a checksum on an arbitrary memory area.
* Returns a 32bit checksum.
*
* This isn't as time critical as it used to be because many NICs
* do hardware checksumming these days.
- *
- * Things tried and found to not make it faster:
- * Manual Prefetching
- * Unrolling to an 128 bytes inner loop.
- * Using interleaving with more registers to break the carry chains.
+ *
+ * Still, with CHECKSUM_COMPLETE this is called to compute
+ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * it's best to have buff aligned on a 64-bit boundary
*/
-static unsigned do_csum(const unsigned char *buff, unsigned len)
+__wsum csum_partial(const void *buff, int len, __wsum sum)
{
- unsigned odd, count;
- unsigned long result = 0;
+ u64 temp64 = (__force u64)sum;
+ unsigned odd, result;
- if (unlikely(len == 0))
- return result;
odd = 1 & (unsigned long) buff;
if (unlikely(odd)) {
- result = *buff << 8;
+ if (unlikely(len == 0))
+ return sum;
+ temp64 = ror32((__force u32)sum, 8);
+ temp64 += (*(unsigned char *)buff << 8);
len--;
buff++;
}
- count = len >> 1; /* nr of 16-bit words.. */
- if (count) {
- if (2 & (unsigned long) buff) {
- result += *(unsigned short *)buff;
- count--;
- len -= 2;
- buff += 2;
- }
- count >>= 1; /* nr of 32-bit words.. */
- if (count) {
- unsigned long zero;
- unsigned count64;
- if (4 & (unsigned long) buff) {
- result += *(unsigned int *) buff;
- count--;
- len -= 4;
- buff += 4;
- }
- count >>= 1; /* nr of 64-bit words.. */
- /* main loop using 64byte blocks */
- zero = 0;
- count64 = count >> 3;
- while (count64) {
- asm("addq 0*8(%[src]),%[res]\n\t"
- "adcq 1*8(%[src]),%[res]\n\t"
- "adcq 2*8(%[src]),%[res]\n\t"
- "adcq 3*8(%[src]),%[res]\n\t"
- "adcq 4*8(%[src]),%[res]\n\t"
- "adcq 5*8(%[src]),%[res]\n\t"
- "adcq 6*8(%[src]),%[res]\n\t"
- "adcq 7*8(%[src]),%[res]\n\t"
- "adcq %[zero],%[res]"
- : [res] "=r" (result)
- : [src] "r" (buff), [zero] "r" (zero),
- "[res]" (result));
- buff += 64;
- count64--;
- }
+ while (unlikely(len >= 64)) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq 2*8(%[src]),%[res]\n\t"
+ "adcq 3*8(%[src]),%[res]\n\t"
+ "adcq 4*8(%[src]),%[res]\n\t"
+ "adcq 5*8(%[src]),%[res]\n\t"
+ "adcq 6*8(%[src]),%[res]\n\t"
+ "adcq 7*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 64;
+ len -= 64;
+ }
+
+ if (len & 32) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq 2*8(%[src]),%[res]\n\t"
+ "adcq 3*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 32;
+ }
+ if (len & 16) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 16;
+ }
+ if (len & 8) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [src] "r" (buff)
+ : "memory");
+ buff += 8;
+ }
+ if (len & 7) {
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+ unsigned int shift = (8 - (len & 7)) * 8;
+ unsigned long trail;
- /* last up to 7 8byte blocks */
- count %= 8;
- while (count) {
- asm("addq %1,%0\n\t"
- "adcq %2,%0\n"
- : "=r" (result)
- : "m" (*(unsigned long *)buff),
- "r" (zero), "0" (result));
- --count;
- buff += 8;
- }
- result = add32_with_carry(result>>32,
- result&0xffffffff);
+ trail = (load_unaligned_zeropad(buff) << shift) >> shift;
- if (len & 4) {
- result += *(unsigned int *) buff;
- buff += 4;
- }
+ asm("addq %[trail],%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [trail] "r" (trail));
+#else
+ if (len & 4) {
+ asm("addq %[val],%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [val] "r" ((u64)*(u32 *)buff)
+ : "memory");
+ buff += 4;
}
if (len & 2) {
- result += *(unsigned short *) buff;
+ asm("addq %[val],%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [val] "r" ((u64)*(u16 *)buff)
+ : "memory");
buff += 2;
}
+ if (len & 1) {
+ asm("addq %[val],%[res]\n\t"
+ "adcq $0,%[res]"
+ : [res] "+r" (temp64)
+ : [val] "r" ((u64)*(u8 *)buff)
+ : "memory");
+ }
+#endif
}
- if (len & 1)
- result += *buff;
- result = add32_with_carry(result>>32, result & 0xffffffff);
- if (unlikely(odd)) {
+ result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
+ if (unlikely(odd)) {
result = from32to16(result);
result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
}
- return result;
-}
-
-/*
- * computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit)
- *
- * returns a 32-bit number suitable for feeding into itself
- * or csum_tcpudp_magic
- *
- * this function must be called with even lengths, except
- * for the last fragment, which may be odd
- *
- * it's best to have buff aligned on a 64-bit boundary
- */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
-{
- return (__force __wsum)add32_with_carry(do_csum(buff, len),
- (__force u32)sum);
+ return (__force __wsum)result;
}
EXPORT_SYMBOL(csum_partial);
@@ -147,4 +147,3 @@ __sum16 ip_compute_csum(const void *buff, int len)
return csum_fold(csum_partial(buff,len,0));
}
EXPORT_SYMBOL(ip_compute_csum);
-
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
index be5b5fb1598b..520897061ee0 100644
--- a/arch/x86/lib/error-inject.c
+++ b/arch/x86/lib/error-inject.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/linkage.h>
#include <linux/error-injection.h>
#include <linux/kprobes.h>
@@ -10,7 +11,7 @@ asm(
".type just_return_func, @function\n"
".globl just_return_func\n"
"just_return_func:\n"
- " ret\n"
+ ASM_RET
".size just_return_func, .-just_return_func\n"
);
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index fa1bc2104b32..b70d98d79a9d 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -57,7 +57,7 @@ SYM_FUNC_START(__get_user_1)
1: movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_1)
EXPORT_SYMBOL(__get_user_1)
@@ -71,7 +71,7 @@ SYM_FUNC_START(__get_user_2)
2: movzwl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_2)
EXPORT_SYMBOL(__get_user_2)
@@ -85,7 +85,7 @@ SYM_FUNC_START(__get_user_4)
3: movl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_4)
EXPORT_SYMBOL(__get_user_4)
@@ -100,7 +100,7 @@ SYM_FUNC_START(__get_user_8)
4: movq (%_ASM_AX),%rdx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
#else
LOAD_TASK_SIZE_MINUS_N(7)
cmp %_ASM_DX,%_ASM_AX
@@ -112,7 +112,7 @@ SYM_FUNC_START(__get_user_8)
5: movl 4(%_ASM_AX),%ecx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
#endif
SYM_FUNC_END(__get_user_8)
EXPORT_SYMBOL(__get_user_8)
@@ -124,7 +124,7 @@ SYM_FUNC_START(__get_user_nocheck_1)
6: movzbl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_1)
EXPORT_SYMBOL(__get_user_nocheck_1)
@@ -134,7 +134,7 @@ SYM_FUNC_START(__get_user_nocheck_2)
7: movzwl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_2)
EXPORT_SYMBOL(__get_user_nocheck_2)
@@ -144,7 +144,7 @@ SYM_FUNC_START(__get_user_nocheck_4)
8: movl (%_ASM_AX),%edx
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_4)
EXPORT_SYMBOL(__get_user_nocheck_4)
@@ -159,7 +159,7 @@ SYM_FUNC_START(__get_user_nocheck_8)
#endif
xor %eax,%eax
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__get_user_nocheck_8)
EXPORT_SYMBOL(__get_user_nocheck_8)
@@ -169,7 +169,7 @@ SYM_CODE_START_LOCAL(.Lbad_get_user_clac)
bad_get_user:
xor %edx,%edx
mov $(-EFAULT),%_ASM_AX
- ret
+ RET
SYM_CODE_END(.Lbad_get_user_clac)
#ifdef CONFIG_X86_32
@@ -179,7 +179,7 @@ bad_get_user_8:
xor %edx,%edx
xor %ecx,%ecx
mov $(-EFAULT),%_ASM_AX
- ret
+ RET
SYM_CODE_END(.Lbad_get_user_8_clac)
#endif
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index dbf8cc97b7f5..12c16c6aa44a 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -32,7 +32,7 @@ SYM_FUNC_START(__sw_hweight32)
imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101
shrl $24, %eax # w = w_tmp >> 24
__ASM_SIZE(pop,) %__ASM_REG(dx)
- ret
+ RET
SYM_FUNC_END(__sw_hweight32)
EXPORT_SYMBOL(__sw_hweight32)
@@ -65,7 +65,7 @@ SYM_FUNC_START(__sw_hweight64)
popq %rdx
popq %rdi
- ret
+ RET
#else /* CONFIG_X86_32 */
/* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
pushl %ecx
@@ -77,7 +77,7 @@ SYM_FUNC_START(__sw_hweight64)
addl %ecx, %eax # result
popl %ecx
- ret
+ RET
#endif
SYM_FUNC_END(__sw_hweight64)
EXPORT_SYMBOL(__sw_hweight64)
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index eb3ccffb9b9d..b781d324211b 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -37,8 +37,6 @@ enum reg_type {
*/
static bool is_string_insn(struct insn *insn)
{
- insn_get_opcode(insn);
-
/* All string instructions have a 1-byte opcode. */
if (insn->opcode.nbytes != 1)
return false;
@@ -412,32 +410,44 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
#endif /* CONFIG_X86_64 */
}
-static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
- enum reg_type type)
+static const int pt_regoff[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#else
+ offsetof(struct pt_regs, ds),
+ offsetof(struct pt_regs, es),
+ offsetof(struct pt_regs, fs),
+ offsetof(struct pt_regs, gs),
+#endif
+};
+
+int pt_regs_offset(struct pt_regs *regs, int regno)
+{
+ if ((unsigned)regno < ARRAY_SIZE(pt_regoff))
+ return pt_regoff[regno];
+ return -EDOM;
+}
+
+static int get_regno(struct insn *insn, enum reg_type type)
{
+ int nr_registers = ARRAY_SIZE(pt_regoff);
int regno = 0;
- static const int regoff[] = {
- offsetof(struct pt_regs, ax),
- offsetof(struct pt_regs, cx),
- offsetof(struct pt_regs, dx),
- offsetof(struct pt_regs, bx),
- offsetof(struct pt_regs, sp),
- offsetof(struct pt_regs, bp),
- offsetof(struct pt_regs, si),
- offsetof(struct pt_regs, di),
-#ifdef CONFIG_X86_64
- offsetof(struct pt_regs, r8),
- offsetof(struct pt_regs, r9),
- offsetof(struct pt_regs, r10),
- offsetof(struct pt_regs, r11),
- offsetof(struct pt_regs, r12),
- offsetof(struct pt_regs, r13),
- offsetof(struct pt_regs, r14),
- offsetof(struct pt_regs, r15),
-#endif
- };
- int nr_registers = ARRAY_SIZE(regoff);
/*
* Don't possibly decode a 32-bit instructions as
* reading a 64-bit-only register.
@@ -505,7 +515,18 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
WARN_ONCE(1, "decoded an instruction with an invalid register");
return -EINVAL;
}
- return regoff[regno];
+ return regno;
+}
+
+static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
+ enum reg_type type)
+{
+ int regno = get_regno(insn, type);
+
+ if (regno < 0)
+ return regno;
+
+ return pt_regs_offset(regs, regno);
}
/**
@@ -851,6 +872,26 @@ int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs)
}
/**
+ * insn_get_modrm_reg_ptr() - Obtain register pointer based on ModRM byte
+ * @insn: Instruction containing the ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the reg part of the ModRM byte.
+ * The register is obtained as a pointer within pt_regs.
+ */
+unsigned long *insn_get_modrm_reg_ptr(struct insn *insn, struct pt_regs *regs)
+{
+ int offset;
+
+ offset = insn_get_modrm_reg_off(insn, regs);
+ if (offset < 0)
+ return NULL;
+ return (void *)regs + offset;
+}
+
+/**
* get_seg_base_limit() - obtain base address and limit of a segment
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
@@ -1405,6 +1446,9 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
if (!insn || !regs)
return (void __user *)-1L;
+ if (insn_get_opcode(insn))
+ return (void __user *)-1L;
+
switch (insn->addr_bytes) {
case 2:
return get_addr_ref_16(insn, regs);
@@ -1539,3 +1583,87 @@ bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
return true;
}
+
+/**
+ * insn_decode_mmio() - Decode a MMIO instruction
+ * @insn: Structure to store decoded instruction
+ * @bytes: Returns size of memory operand
+ *
+ * Decodes instruction that used for Memory-mapped I/O.
+ *
+ * Returns:
+ *
+ * Type of the instruction. Size of the memory operand is stored in
+ * @bytes. If decode failed, MMIO_DECODE_FAILED returned.
+ */
+enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
+{
+ enum mmio_type type = MMIO_DECODE_FAILED;
+
+ *bytes = 0;
+
+ if (insn_get_opcode(insn))
+ return MMIO_DECODE_FAILED;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x88: /* MOV m8,r8 */
+ *bytes = 1;
+ fallthrough;
+ case 0x89: /* MOV m16/m32/m64, r16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_WRITE;
+ break;
+
+ case 0xc6: /* MOV m8, imm8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xc7: /* MOV m16/m32/m64, imm16/imm32/imm64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_WRITE_IMM;
+ break;
+
+ case 0x8a: /* MOV r8, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0x8b: /* MOV r16/r32/r64, m16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_READ;
+ break;
+
+ case 0xa4: /* MOVS m8, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xa5: /* MOVS m16/m32/m64, m16/m32/m64 */
+ if (!*bytes)
+ *bytes = insn->opnd_bytes;
+ type = MMIO_MOVS;
+ break;
+
+ case 0x0f: /* Two-byte instruction */
+ switch (insn->opcode.bytes[1]) {
+ case 0xb6: /* MOVZX r16/r32/r64, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xb7: /* MOVZX r32/r64, m16 */
+ if (!*bytes)
+ *bytes = 2;
+ type = MMIO_READ_ZERO_EXTEND;
+ break;
+
+ case 0xbe: /* MOVSX r16/r32/r64, m8 */
+ *bytes = 1;
+ fallthrough;
+ case 0xbf: /* MOVSX r32/r64, m16 */
+ if (!*bytes)
+ *bytes = 2;
+ type = MMIO_READ_SIGN_EXTEND;
+ break;
+ }
+ break;
+ }
+
+ return type;
+}
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
index cb5a1964506b..a1f9416bf67a 100644
--- a/arch/x86/lib/iomap_copy_64.S
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -11,5 +11,5 @@
SYM_FUNC_START(__iowrite32_copy)
movl %edx,%ecx
rep movsd
- ret
+ RET
SYM_FUNC_END(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index e565d1c9019e..3a6e6cfe8c35 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -7,11 +7,7 @@
__visible void *memcpy(void *to, const void *from, size_t n)
{
-#if defined(CONFIG_X86_USE_3DNOW) && !defined(CONFIG_FORTIFY_SOURCE)
- return __memcpy3d(to, from, n);
-#else
return __memcpy(to, from, n);
-#endif
}
EXPORT_SYMBOL(memcpy);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 1cc9da6e29c7..59cf2343f3d9 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -39,7 +39,7 @@ SYM_FUNC_START_WEAK(memcpy)
rep movsq
movl %edx, %ecx
rep movsb
- ret
+ RET
SYM_FUNC_END(memcpy)
SYM_FUNC_END_ALIAS(__memcpy)
EXPORT_SYMBOL(memcpy)
@@ -53,7 +53,7 @@ SYM_FUNC_START_LOCAL(memcpy_erms)
movq %rdi, %rax
movq %rdx, %rcx
rep movsb
- ret
+ RET
SYM_FUNC_END(memcpy_erms)
SYM_FUNC_START_LOCAL(memcpy_orig)
@@ -137,7 +137,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movq %r9, 1*8(%rdi)
movq %r10, -2*8(%rdi, %rdx)
movq %r11, -1*8(%rdi, %rdx)
- retq
+ RET
.p2align 4
.Lless_16bytes:
cmpl $8, %edx
@@ -149,7 +149,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movq -1*8(%rsi, %rdx), %r9
movq %r8, 0*8(%rdi)
movq %r9, -1*8(%rdi, %rdx)
- retq
+ RET
.p2align 4
.Lless_8bytes:
cmpl $4, %edx
@@ -162,7 +162,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movl -4(%rsi, %rdx), %r8d
movl %ecx, (%rdi)
movl %r8d, -4(%rdi, %rdx)
- retq
+ RET
.p2align 4
.Lless_3bytes:
subl $1, %edx
@@ -180,7 +180,7 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
movb %cl, (%rdi)
.Lend:
- retq
+ RET
SYM_FUNC_END(memcpy_orig)
.popsection
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 64801010d312..50ea390df712 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -40,7 +40,7 @@ SYM_FUNC_START(__memmove)
/* FSRM implies ERMS => no length checks, do the copy directly */
.Lmemmove_begin_forward:
ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
- ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+ ALTERNATIVE "", __stringify(movq %rdx, %rcx; rep movsb; RET), X86_FEATURE_ERMS
/*
* movsq instruction have many startup latency
@@ -205,7 +205,7 @@ SYM_FUNC_START(__memmove)
movb (%rsi), %r11b
movb %r11b, (%rdi)
13:
- retq
+ RET
SYM_FUNC_END(__memmove)
SYM_FUNC_END_ALIAS(memmove)
EXPORT_SYMBOL(__memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9827ae267f96..d624f2bc42f1 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -40,7 +40,7 @@ SYM_FUNC_START(__memset)
movl %edx,%ecx
rep stosb
movq %r9,%rax
- ret
+ RET
SYM_FUNC_END(__memset)
SYM_FUNC_END_ALIAS(memset)
EXPORT_SYMBOL(memset)
@@ -63,7 +63,7 @@ SYM_FUNC_START_LOCAL(memset_erms)
movq %rdx,%rcx
rep stosb
movq %r9,%rax
- ret
+ RET
SYM_FUNC_END(memset_erms)
SYM_FUNC_START_LOCAL(memset_orig)
@@ -125,7 +125,7 @@ SYM_FUNC_START_LOCAL(memset_orig)
.Lende:
movq %r10,%rax
- ret
+ RET
.Lbad_alignment:
cmpq $7,%rdx
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index cc5f4ea943d3..e69de29bb2d1 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -1,388 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * MMX 3DNow! library helper functions
- *
- * To do:
- * We can use MMX just for prefetch in IRQ's. This may be a win.
- * (reported so on K6-III)
- * We should use a better code neutral filler for the short jump
- * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
- * We also want to clobber the filler register so we don't get any
- * register forwarding stalls on the filler.
- *
- * Add *user handling. Checksums are not a win with MMX on any CPU
- * tested so far for any MMX solution figured.
- *
- * 22/09/2000 - Arjan van de Ven
- * Improved for non-engineering-sample Athlons
- *
- */
-#include <linux/hardirq.h>
-#include <linux/string.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-#include <asm/fpu/api.h>
-#include <asm/asm.h>
-
-/*
- * Use KFPU_387. MMX instructions are not affected by MXCSR,
- * but both AMD and Intel documentation states that even integer MMX
- * operations will result in #MF if an exception is pending in FCW.
- *
- * EMMS is not needed afterwards because, after calling kernel_fpu_end(),
- * any subsequent user of the 387 stack will reinitialize it using
- * KFPU_387.
- */
-
-void *_mmx_memcpy(void *to, const void *from, size_t len)
-{
- void *p;
- int i;
-
- if (unlikely(in_interrupt()))
- return __memcpy(to, from, len);
-
- p = to;
- i = len >> 6; /* len/64 */
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- "1: prefetch (%0)\n" /* This set is 28 bytes */
- " prefetch 64(%0)\n"
- " prefetch 128(%0)\n"
- " prefetch 192(%0)\n"
- " prefetch 256(%0)\n"
- "2: \n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : : "r" (from));
-
- for ( ; i > 5; i--) {
- __asm__ __volatile__ (
- "1: prefetch 320(%0)\n"
- "2: movq (%0), %%mm0\n"
- " movq 8(%0), %%mm1\n"
- " movq 16(%0), %%mm2\n"
- " movq 24(%0), %%mm3\n"
- " movq %%mm0, (%1)\n"
- " movq %%mm1, 8(%1)\n"
- " movq %%mm2, 16(%1)\n"
- " movq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm0\n"
- " movq 40(%0), %%mm1\n"
- " movq 48(%0), %%mm2\n"
- " movq 56(%0), %%mm3\n"
- " movq %%mm0, 32(%1)\n"
- " movq %%mm1, 40(%1)\n"
- " movq %%mm2, 48(%1)\n"
- " movq %%mm3, 56(%1)\n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
-
- for ( ; i > 0; i--) {
- __asm__ __volatile__ (
- " movq (%0), %%mm0\n"
- " movq 8(%0), %%mm1\n"
- " movq 16(%0), %%mm2\n"
- " movq 24(%0), %%mm3\n"
- " movq %%mm0, (%1)\n"
- " movq %%mm1, 8(%1)\n"
- " movq %%mm2, 16(%1)\n"
- " movq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm0\n"
- " movq 40(%0), %%mm1\n"
- " movq 48(%0), %%mm2\n"
- " movq 56(%0), %%mm3\n"
- " movq %%mm0, 32(%1)\n"
- " movq %%mm1, 40(%1)\n"
- " movq %%mm2, 48(%1)\n"
- " movq %%mm3, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
- /*
- * Now do the tail of the block:
- */
- __memcpy(to, from, len & 63);
- kernel_fpu_end();
-
- return p;
-}
-EXPORT_SYMBOL(_mmx_memcpy);
-
-#ifdef CONFIG_MK7
-
-/*
- * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
- * other MMX using processors do not.
- */
-
-static void fast_clear_page(void *page)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- " pxor %%mm0, %%mm0\n" : :
- );
-
- for (i = 0; i < 4096/64; i++) {
- __asm__ __volatile__ (
- " movntq %%mm0, (%0)\n"
- " movntq %%mm0, 8(%0)\n"
- " movntq %%mm0, 16(%0)\n"
- " movntq %%mm0, 24(%0)\n"
- " movntq %%mm0, 32(%0)\n"
- " movntq %%mm0, 40(%0)\n"
- " movntq %%mm0, 48(%0)\n"
- " movntq %%mm0, 56(%0)\n"
- : : "r" (page) : "memory");
- page += 64;
- }
-
- /*
- * Since movntq is weakly-ordered, a "sfence" is needed to become
- * ordered again:
- */
- __asm__ __volatile__("sfence\n"::);
-
- kernel_fpu_end();
-}
-
-static void fast_copy_page(void *to, void *from)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- /*
- * maybe the prefetch stuff can go before the expensive fnsave...
- * but that is for later. -AV
- */
- __asm__ __volatile__(
- "1: prefetch (%0)\n"
- " prefetch 64(%0)\n"
- " prefetch 128(%0)\n"
- " prefetch 192(%0)\n"
- " prefetch 256(%0)\n"
- "2: \n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b) : : "r" (from));
-
- for (i = 0; i < (4096-320)/64; i++) {
- __asm__ __volatile__ (
- "1: prefetch 320(%0)\n"
- "2: movq (%0), %%mm0\n"
- " movntq %%mm0, (%1)\n"
- " movq 8(%0), %%mm1\n"
- " movntq %%mm1, 8(%1)\n"
- " movq 16(%0), %%mm2\n"
- " movntq %%mm2, 16(%1)\n"
- " movq 24(%0), %%mm3\n"
- " movntq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm4\n"
- " movntq %%mm4, 32(%1)\n"
- " movq 40(%0), %%mm5\n"
- " movntq %%mm5, 40(%1)\n"
- " movq 48(%0), %%mm6\n"
- " movntq %%mm6, 48(%1)\n"
- " movq 56(%0), %%mm7\n"
- " movntq %%mm7, 56(%1)\n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
-
- for (i = (4096-320)/64; i < 4096/64; i++) {
- __asm__ __volatile__ (
- "2: movq (%0), %%mm0\n"
- " movntq %%mm0, (%1)\n"
- " movq 8(%0), %%mm1\n"
- " movntq %%mm1, 8(%1)\n"
- " movq 16(%0), %%mm2\n"
- " movntq %%mm2, 16(%1)\n"
- " movq 24(%0), %%mm3\n"
- " movntq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm4\n"
- " movntq %%mm4, 32(%1)\n"
- " movq 40(%0), %%mm5\n"
- " movntq %%mm5, 40(%1)\n"
- " movq 48(%0), %%mm6\n"
- " movntq %%mm6, 48(%1)\n"
- " movq 56(%0), %%mm7\n"
- " movntq %%mm7, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- from += 64;
- to += 64;
- }
- /*
- * Since movntq is weakly-ordered, a "sfence" is needed to become
- * ordered again:
- */
- __asm__ __volatile__("sfence \n"::);
- kernel_fpu_end();
-}
-
-#else /* CONFIG_MK7 */
-
-/*
- * Generic MMX implementation without K7 specific streaming
- */
-static void fast_clear_page(void *page)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- " pxor %%mm0, %%mm0\n" : :
- );
-
- for (i = 0; i < 4096/128; i++) {
- __asm__ __volatile__ (
- " movq %%mm0, (%0)\n"
- " movq %%mm0, 8(%0)\n"
- " movq %%mm0, 16(%0)\n"
- " movq %%mm0, 24(%0)\n"
- " movq %%mm0, 32(%0)\n"
- " movq %%mm0, 40(%0)\n"
- " movq %%mm0, 48(%0)\n"
- " movq %%mm0, 56(%0)\n"
- " movq %%mm0, 64(%0)\n"
- " movq %%mm0, 72(%0)\n"
- " movq %%mm0, 80(%0)\n"
- " movq %%mm0, 88(%0)\n"
- " movq %%mm0, 96(%0)\n"
- " movq %%mm0, 104(%0)\n"
- " movq %%mm0, 112(%0)\n"
- " movq %%mm0, 120(%0)\n"
- : : "r" (page) : "memory");
- page += 128;
- }
-
- kernel_fpu_end();
-}
-
-static void fast_copy_page(void *to, void *from)
-{
- int i;
-
- kernel_fpu_begin_mask(KFPU_387);
-
- __asm__ __volatile__ (
- "1: prefetch (%0)\n"
- " prefetch 64(%0)\n"
- " prefetch 128(%0)\n"
- " prefetch 192(%0)\n"
- " prefetch 256(%0)\n"
- "2: \n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b) : : "r" (from));
-
- for (i = 0; i < 4096/64; i++) {
- __asm__ __volatile__ (
- "1: prefetch 320(%0)\n"
- "2: movq (%0), %%mm0\n"
- " movq 8(%0), %%mm1\n"
- " movq 16(%0), %%mm2\n"
- " movq 24(%0), %%mm3\n"
- " movq %%mm0, (%1)\n"
- " movq %%mm1, 8(%1)\n"
- " movq %%mm2, 16(%1)\n"
- " movq %%mm3, 24(%1)\n"
- " movq 32(%0), %%mm0\n"
- " movq 40(%0), %%mm1\n"
- " movq 48(%0), %%mm2\n"
- " movq 56(%0), %%mm3\n"
- " movq %%mm0, 32(%1)\n"
- " movq %%mm1, 40(%1)\n"
- " movq %%mm2, 48(%1)\n"
- " movq %%mm3, 56(%1)\n"
- ".section .fixup, \"ax\"\n"
- "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : : "r" (from), "r" (to) : "memory");
-
- from += 64;
- to += 64;
- }
- kernel_fpu_end();
-}
-
-#endif /* !CONFIG_MK7 */
-
-/*
- * Favour MMX for page clear and copy:
- */
-static void slow_zero_page(void *page)
-{
- int d0, d1;
-
- __asm__ __volatile__(
- "cld\n\t"
- "rep ; stosl"
-
- : "=&c" (d0), "=&D" (d1)
- :"a" (0), "1" (page), "0" (1024)
- :"memory");
-}
-
-void mmx_clear_page(void *page)
-{
- if (unlikely(in_interrupt()))
- slow_zero_page(page);
- else
- fast_clear_page(page);
-}
-EXPORT_SYMBOL(mmx_clear_page);
-
-static void slow_copy_page(void *to, void *from)
-{
- int d0, d1, d2;
-
- __asm__ __volatile__(
- "cld\n\t"
- "rep ; movsl"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (1024), "1" ((long) to), "2" ((long) from)
- : "memory");
-}
-
-void mmx_copy_page(void *to, void *from)
-{
- if (unlikely(in_interrupt()))
- slow_copy_page(to, from);
- else
- fast_copy_page(to, from);
-}
-EXPORT_SYMBOL(mmx_copy_page);
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index a2b9caa5274c..ebd259f31496 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -35,7 +35,7 @@ SYM_FUNC_START(\op\()_safe_regs)
movl %edi, 28(%r10)
popq %r12
popq %rbx
- ret
+ RET
3:
movl $-EIO, %r11d
jmp 2b
@@ -77,7 +77,7 @@ SYM_FUNC_START(\op\()_safe_regs)
popl %esi
popl %ebp
popl %ebx
- ret
+ RET
3:
movl $-EIO, 4(%esp)
jmp 2b
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 0ea344c5ea43..ecb2049c1273 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -52,7 +52,7 @@ SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL)
1: movb %al,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__put_user_1)
EXPORT_SYMBOL(__put_user_1)
EXPORT_SYMBOL(__put_user_nocheck_1)
@@ -66,7 +66,7 @@ SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL)
2: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__put_user_2)
EXPORT_SYMBOL(__put_user_2)
EXPORT_SYMBOL(__put_user_nocheck_2)
@@ -80,7 +80,7 @@ SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL)
3: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
- ret
+ RET
SYM_FUNC_END(__put_user_4)
EXPORT_SYMBOL(__put_user_4)
EXPORT_SYMBOL(__put_user_nocheck_4)
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index cf0b39f97adc..89b3fb244e15 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -23,7 +23,7 @@
.Ldo_rop_\@:
mov %\reg, (%_ASM_SP)
UNWIND_HINT_FUNC
- ret
+ RET
.endm
.macro THUNK reg
@@ -34,7 +34,7 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
__stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg; int3), X86_FEATURE_RETPOLINE_AMD
.endm
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 7d290777246d..422257c350c6 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -8,7 +8,6 @@
*/
#include <linux/export.h>
#include <linux/uaccess.h>
-#include <asm/mmx.h>
#include <asm/asm.h>
#ifdef CONFIG_X86_INTEL_USERCOPY
@@ -43,11 +42,7 @@ do { \
" movl %2,%0\n" \
"1: rep; stosb\n" \
"2: " ASM_CLAC "\n" \
- ".section .fixup,\"ax\"\n" \
- "3: lea 0(%2,%0,4),%0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(0b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %2) \
_ASM_EXTABLE_UA(1b, 2b) \
: "=&c"(size), "=&D" (__d0) \
: "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \
@@ -149,10 +144,6 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
"36: movl %%eax, %0\n"
"37: rep; movsb\n"
"100:\n"
- ".section .fixup,\"ax\"\n"
- "101: lea 0(%%eax,%0,4),%0\n"
- " jmp 100b\n"
- ".previous\n"
_ASM_EXTABLE_UA(1b, 100b)
_ASM_EXTABLE_UA(2b, 100b)
_ASM_EXTABLE_UA(3b, 100b)
@@ -190,7 +181,7 @@ __copy_user_intel(void __user *to, const void *from, unsigned long size)
_ASM_EXTABLE_UA(35b, 100b)
_ASM_EXTABLE_UA(36b, 100b)
_ASM_EXTABLE_UA(37b, 100b)
- _ASM_EXTABLE_UA(99b, 101b)
+ _ASM_EXTABLE_TYPE_REG(99b, 100b, EX_TYPE_UCOPY_LEN4, %%eax)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -255,30 +246,26 @@ static unsigned long __copy_user_intel_nocache(void *to,
" movl %%eax,%0\n"
"7: rep; movsb\n"
"8:\n"
- ".section .fixup,\"ax\"\n"
- "9: lea 0(%%eax,%0,4),%0\n"
- "16: jmp 8b\n"
- ".previous\n"
- _ASM_EXTABLE_UA(0b, 16b)
- _ASM_EXTABLE_UA(1b, 16b)
- _ASM_EXTABLE_UA(2b, 16b)
- _ASM_EXTABLE_UA(21b, 16b)
- _ASM_EXTABLE_UA(3b, 16b)
- _ASM_EXTABLE_UA(31b, 16b)
- _ASM_EXTABLE_UA(4b, 16b)
- _ASM_EXTABLE_UA(41b, 16b)
- _ASM_EXTABLE_UA(10b, 16b)
- _ASM_EXTABLE_UA(51b, 16b)
- _ASM_EXTABLE_UA(11b, 16b)
- _ASM_EXTABLE_UA(61b, 16b)
- _ASM_EXTABLE_UA(12b, 16b)
- _ASM_EXTABLE_UA(71b, 16b)
- _ASM_EXTABLE_UA(13b, 16b)
- _ASM_EXTABLE_UA(81b, 16b)
- _ASM_EXTABLE_UA(14b, 16b)
- _ASM_EXTABLE_UA(91b, 16b)
- _ASM_EXTABLE_UA(6b, 9b)
- _ASM_EXTABLE_UA(7b, 16b)
+ _ASM_EXTABLE_UA(0b, 8b)
+ _ASM_EXTABLE_UA(1b, 8b)
+ _ASM_EXTABLE_UA(2b, 8b)
+ _ASM_EXTABLE_UA(21b, 8b)
+ _ASM_EXTABLE_UA(3b, 8b)
+ _ASM_EXTABLE_UA(31b, 8b)
+ _ASM_EXTABLE_UA(4b, 8b)
+ _ASM_EXTABLE_UA(41b, 8b)
+ _ASM_EXTABLE_UA(10b, 8b)
+ _ASM_EXTABLE_UA(51b, 8b)
+ _ASM_EXTABLE_UA(11b, 8b)
+ _ASM_EXTABLE_UA(61b, 8b)
+ _ASM_EXTABLE_UA(12b, 8b)
+ _ASM_EXTABLE_UA(71b, 8b)
+ _ASM_EXTABLE_UA(13b, 8b)
+ _ASM_EXTABLE_UA(81b, 8b)
+ _ASM_EXTABLE_UA(14b, 8b)
+ _ASM_EXTABLE_UA(91b, 8b)
+ _ASM_EXTABLE_TYPE_REG(6b, 8b, EX_TYPE_UCOPY_LEN4, %%eax)
+ _ASM_EXTABLE_UA(7b, 8b)
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -315,14 +302,8 @@ do { \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "5: addl %3,%0\n" \
- " jmp 2b\n" \
- "3: lea 0(%3,%0,4),%0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE_UA(4b, 5b) \
- _ASM_EXTABLE_UA(0b, 3b) \
+ _ASM_EXTABLE_TYPE_REG(4b, 2b, EX_TYPE_UCOPY_LEN1, %3) \
+ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN4, %3) \
_ASM_EXTABLE_UA(1b, 2b) \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 508c81e97ab1..0402a749f3a0 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -35,12 +35,10 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
" incq %[dst]\n"
" decl %%ecx ; jnz 1b\n"
"2:\n"
- ".section .fixup,\"ax\"\n"
- "3: lea 0(%[size1],%[size8],8),%[size8]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE_UA(0b, 3b)
+
+ _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1])
_ASM_EXTABLE_UA(1b, 2b)
+
: [size8] "=&c"(size), [dst] "=&D" (__d0)
: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
clac();
diff --git a/arch/x86/math-emu/div_Xsig.S b/arch/x86/math-emu/div_Xsig.S
index 951da2ad54bb..8c270ab415be 100644
--- a/arch/x86/math-emu/div_Xsig.S
+++ b/arch/x86/math-emu/div_Xsig.S
@@ -341,7 +341,7 @@ L_exit:
popl %esi
leave
- ret
+ RET
#ifdef PARANOID
diff --git a/arch/x86/math-emu/div_small.S b/arch/x86/math-emu/div_small.S
index d047d1816abe..637439bfefa4 100644
--- a/arch/x86/math-emu/div_small.S
+++ b/arch/x86/math-emu/div_small.S
@@ -44,5 +44,5 @@ SYM_FUNC_START(FPU_div_small)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_div_small)
diff --git a/arch/x86/math-emu/mul_Xsig.S b/arch/x86/math-emu/mul_Xsig.S
index 4afc7b1fa6e9..54a031b66142 100644
--- a/arch/x86/math-emu/mul_Xsig.S
+++ b/arch/x86/math-emu/mul_Xsig.S
@@ -62,7 +62,7 @@ SYM_FUNC_START(mul32_Xsig)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(mul32_Xsig)
@@ -115,7 +115,7 @@ SYM_FUNC_START(mul64_Xsig)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(mul64_Xsig)
@@ -175,5 +175,5 @@ SYM_FUNC_START(mul_Xsig_Xsig)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(mul_Xsig_Xsig)
diff --git a/arch/x86/math-emu/polynom_Xsig.S b/arch/x86/math-emu/polynom_Xsig.S
index 702315eecb86..35fd723fc0df 100644
--- a/arch/x86/math-emu/polynom_Xsig.S
+++ b/arch/x86/math-emu/polynom_Xsig.S
@@ -133,5 +133,5 @@ L_accum_done:
popl %edi
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(polynomial_Xsig)
diff --git a/arch/x86/math-emu/reg_norm.S b/arch/x86/math-emu/reg_norm.S
index cad1d60b1e84..594936eeed67 100644
--- a/arch/x86/math-emu/reg_norm.S
+++ b/arch/x86/math-emu/reg_norm.S
@@ -72,7 +72,7 @@ L_exit_valid:
L_exit:
popl %ebx
leave
- ret
+ RET
L_zero:
@@ -138,7 +138,7 @@ L_exit_nuo_valid:
popl %ebx
leave
- ret
+ RET
L_exit_nuo_zero:
movl TAG_Zero,%eax
@@ -146,5 +146,5 @@ L_exit_nuo_zero:
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(FPU_normalize_nuo)
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
index 4a9fc3cc5a4d..0bb2a092161a 100644
--- a/arch/x86/math-emu/reg_round.S
+++ b/arch/x86/math-emu/reg_round.S
@@ -437,7 +437,7 @@ fpu_Arith_exit:
popl %edi
popl %esi
leave
- ret
+ RET
/*
diff --git a/arch/x86/math-emu/reg_u_add.S b/arch/x86/math-emu/reg_u_add.S
index 9c9e2c810afe..07247287a3af 100644
--- a/arch/x86/math-emu/reg_u_add.S
+++ b/arch/x86/math-emu/reg_u_add.S
@@ -164,6 +164,6 @@ L_exit:
popl %edi
popl %esi
leave
- ret
+ RET
#endif /* PARANOID */
SYM_FUNC_END(FPU_u_add)
diff --git a/arch/x86/math-emu/reg_u_div.S b/arch/x86/math-emu/reg_u_div.S
index e2fb5c2644c5..b5a41e2fc484 100644
--- a/arch/x86/math-emu/reg_u_div.S
+++ b/arch/x86/math-emu/reg_u_div.S
@@ -468,7 +468,7 @@ L_exit:
popl %esi
leave
- ret
+ RET
#endif /* PARANOID */
SYM_FUNC_END(FPU_u_div)
diff --git a/arch/x86/math-emu/reg_u_mul.S b/arch/x86/math-emu/reg_u_mul.S
index 0c779c87ac5b..e2588b24b8c2 100644
--- a/arch/x86/math-emu/reg_u_mul.S
+++ b/arch/x86/math-emu/reg_u_mul.S
@@ -144,7 +144,7 @@ L_exit:
popl %edi
popl %esi
leave
- ret
+ RET
#endif /* PARANOID */
SYM_FUNC_END(FPU_u_mul)
diff --git a/arch/x86/math-emu/reg_u_sub.S b/arch/x86/math-emu/reg_u_sub.S
index e9bb7c248649..4c900c29e4ff 100644
--- a/arch/x86/math-emu/reg_u_sub.S
+++ b/arch/x86/math-emu/reg_u_sub.S
@@ -270,5 +270,5 @@ L_exit:
popl %edi
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_u_sub)
diff --git a/arch/x86/math-emu/round_Xsig.S b/arch/x86/math-emu/round_Xsig.S
index d9d7de8dbd7b..126c40473bad 100644
--- a/arch/x86/math-emu/round_Xsig.S
+++ b/arch/x86/math-emu/round_Xsig.S
@@ -78,7 +78,7 @@ L_exit:
popl %esi
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(round_Xsig)
@@ -138,5 +138,5 @@ L_n_exit:
popl %esi
popl %ebx
leave
- ret
+ RET
SYM_FUNC_END(norm_Xsig)
diff --git a/arch/x86/math-emu/shr_Xsig.S b/arch/x86/math-emu/shr_Xsig.S
index 726af985f758..f726bf6f6396 100644
--- a/arch/x86/math-emu/shr_Xsig.S
+++ b/arch/x86/math-emu/shr_Xsig.S
@@ -45,7 +45,7 @@ SYM_FUNC_START(shr_Xsig)
popl %ebx
popl %esi
leave
- ret
+ RET
L_more_than_31:
cmpl $64,%ecx
@@ -61,7 +61,7 @@ L_more_than_31:
movl $0,8(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_63:
cmpl $96,%ecx
@@ -76,7 +76,7 @@ L_more_than_63:
movl %edx,8(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_95:
xorl %eax,%eax
@@ -85,5 +85,5 @@ L_more_than_95:
movl %eax,8(%esi)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(shr_Xsig)
diff --git a/arch/x86/math-emu/wm_shrx.S b/arch/x86/math-emu/wm_shrx.S
index 4fc89174caf0..f608a28a4c43 100644
--- a/arch/x86/math-emu/wm_shrx.S
+++ b/arch/x86/math-emu/wm_shrx.S
@@ -55,7 +55,7 @@ SYM_FUNC_START(FPU_shrx)
popl %ebx
popl %esi
leave
- ret
+ RET
L_more_than_31:
cmpl $64,%ecx
@@ -70,7 +70,7 @@ L_more_than_31:
movl $0,4(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_63:
cmpl $96,%ecx
@@ -84,7 +84,7 @@ L_more_than_63:
movl %edx,4(%esi)
popl %esi
leave
- ret
+ RET
L_more_than_95:
xorl %eax,%eax
@@ -92,7 +92,7 @@ L_more_than_95:
movl %eax,4(%esi)
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_shrx)
@@ -146,7 +146,7 @@ SYM_FUNC_START(FPU_shrxs)
popl %ebx
popl %esi
leave
- ret
+ RET
/* Shift by [0..31] bits */
Ls_less_than_32:
@@ -163,7 +163,7 @@ Ls_less_than_32:
popl %ebx
popl %esi
leave
- ret
+ RET
/* Shift by [64..95] bits */
Ls_more_than_63:
@@ -189,7 +189,7 @@ Ls_more_than_63:
popl %ebx
popl %esi
leave
- ret
+ RET
Ls_more_than_95:
/* Shift by [96..inf) bits */
@@ -203,5 +203,5 @@ Ls_more_than_95:
popl %ebx
popl %esi
leave
- ret
+ RET
SYM_FUNC_END(FPU_shrxs)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 5864219221ca..fe3d3061fc11 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,9 +2,11 @@
# Kernel does not boot with instrumentation of tlb.c and mem_encrypt*.c
KCOV_INSTRUMENT_tlb.o := n
KCOV_INSTRUMENT_mem_encrypt.o := n
+KCOV_INSTRUMENT_mem_encrypt_amd.o := n
KCOV_INSTRUMENT_mem_encrypt_identity.o := n
KASAN_SANITIZE_mem_encrypt.o := n
+KASAN_SANITIZE_mem_encrypt_amd.o := n
KASAN_SANITIZE_mem_encrypt_identity.o := n
# Disable KCSAN entirely, because otherwise we get warnings that some functions
@@ -13,6 +15,7 @@ KCSAN_SANITIZE := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
+CFLAGS_REMOVE_mem_encrypt_amd.o = -pg
CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
@@ -52,6 +55,8 @@ obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
-obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_X86_MEM_ENCRYPT) += mem_encrypt.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_amd.o
+
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_identity.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 5cd2a88930a9..dba2197c05c3 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -2,12 +2,26 @@
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
+#include <linux/bitfield.h>
#include <xen/xen.h>
#include <asm/fpu/api.h>
#include <asm/sev.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
+#include <asm/insn-eval.h>
+#include <asm/sgx.h>
+
+static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr)
+{
+ int reg_offset = pt_regs_offset(regs, nr);
+ static unsigned long __dummy;
+
+ if (WARN_ON_ONCE(reg_offset < 0))
+ return &__dummy;
+
+ return (unsigned long *)((unsigned long)regs + reg_offset);
+}
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
@@ -15,10 +29,15 @@ ex_fixup_addr(const struct exception_table_entry *x)
return (unsigned long)&x->fixup + x->fixup;
}
-static bool ex_handler_default(const struct exception_table_entry *fixup,
+static bool ex_handler_default(const struct exception_table_entry *e,
struct pt_regs *regs)
{
- regs->ip = ex_fixup_addr(fixup);
+ if (e->data & EX_FLAG_CLEAR_AX)
+ regs->ax = 0;
+ if (e->data & EX_FLAG_CLEAR_DX)
+ regs->dx = 0;
+
+ regs->ip = ex_fixup_addr(e);
return true;
}
@@ -29,6 +48,13 @@ static bool ex_handler_fault(const struct exception_table_entry *fixup,
return ex_handler_default(fixup, regs);
}
+static bool ex_handler_sgx(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG;
+ return ex_handler_default(fixup, regs);
+}
+
/*
* Handler for when we fail to restore a task's FPU state. We should never get
* here because the FPU state of a task using the FPU (task->thread.fpu.state)
@@ -65,28 +91,29 @@ static bool ex_handler_copy(const struct exception_table_entry *fixup,
return ex_handler_fault(fixup, regs, trapnr);
}
-static bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
- struct pt_regs *regs)
+static bool ex_handler_msr(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
- if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ if (!safe && wrmsr &&
+ pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx,
+ (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
+ show_stack_regs(regs);
+
+ if (!safe && !wrmsr &&
+ pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
(unsigned int)regs->cx, regs->ip, (void *)regs->ip))
show_stack_regs(regs);
- /* Pretend that the read succeeded and returned 0. */
- regs->ax = 0;
- regs->dx = 0;
- return ex_handler_default(fixup, regs);
-}
+ if (!wrmsr) {
+ /* Pretend that the read succeeded and returned 0. */
+ regs->ax = 0;
+ regs->dx = 0;
+ }
-static bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
- struct pt_regs *regs)
-{
- if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, (unsigned int)regs->dx,
- (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
- show_stack_regs(regs);
+ if (safe)
+ *pt_regs_nr(regs, reg) = -EIO;
- /* Pretend that the write succeeded. */
return ex_handler_default(fixup, regs);
}
@@ -99,17 +126,32 @@ static bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
return ex_handler_default(fixup, regs);
}
+static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int reg, int imm)
+{
+ *pt_regs_nr(regs, reg) = (long)imm;
+ return ex_handler_default(fixup, regs);
+}
+
+static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr, int reg, int imm)
+{
+ regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
+ return ex_handler_uaccess(fixup, regs, trapnr);
+}
+
int ex_get_fixup_type(unsigned long ip)
{
const struct exception_table_entry *e = search_exception_tables(ip);
- return e ? e->type : EX_TYPE_NONE;
+ return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE;
}
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
unsigned long fault_addr)
{
const struct exception_table_entry *e;
+ int type, reg, imm;
#ifdef CONFIG_PNPBIOS
if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
@@ -129,7 +171,11 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
if (!e)
return 0;
- switch (e->type) {
+ type = FIELD_GET(EX_DATA_TYPE_MASK, e->data);
+ reg = FIELD_GET(EX_DATA_REG_MASK, e->data);
+ imm = FIELD_GET(EX_DATA_IMM_MASK, e->data);
+
+ switch (type) {
case EX_TYPE_DEFAULT:
case EX_TYPE_DEFAULT_MCE_SAFE:
return ex_handler_default(e, regs);
@@ -144,18 +190,31 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
return ex_handler_clear_fs(e, regs);
case EX_TYPE_FPU_RESTORE:
return ex_handler_fprestore(e, regs);
- case EX_TYPE_RDMSR:
- return ex_handler_rdmsr_unsafe(e, regs);
- case EX_TYPE_WRMSR:
- return ex_handler_wrmsr_unsafe(e, regs);
case EX_TYPE_BPF:
return ex_handler_bpf(e, regs);
- case EX_TYPE_RDMSR_IN_MCE:
- ex_handler_msr_mce(regs, false);
- break;
+ case EX_TYPE_WRMSR:
+ return ex_handler_msr(e, regs, true, false, reg);
+ case EX_TYPE_RDMSR:
+ return ex_handler_msr(e, regs, false, false, reg);
+ case EX_TYPE_WRMSR_SAFE:
+ return ex_handler_msr(e, regs, true, true, reg);
+ case EX_TYPE_RDMSR_SAFE:
+ return ex_handler_msr(e, regs, false, true, reg);
case EX_TYPE_WRMSR_IN_MCE:
ex_handler_msr_mce(regs, true);
break;
+ case EX_TYPE_RDMSR_IN_MCE:
+ ex_handler_msr_mce(regs, false);
+ break;
+ case EX_TYPE_POP_REG:
+ regs->sp += sizeof(long);
+ fallthrough;
+ case EX_TYPE_IMM_REG:
+ return ex_handler_imm_reg(e, regs, reg, imm);
+ case EX_TYPE_FAULT_SGX:
+ return ex_handler_sgx(e, regs, trapnr);
+ case EX_TYPE_UCOPY_LEN:
+ return ex_handler_ucopy_len(e, regs, trapnr, reg, imm);
}
BUG();
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4bfed53e210e..d0074c6ed31a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1413,8 +1413,7 @@ good_area:
* and if there is a fatal signal pending there is no guarantee
* that we made any progress. Handle this case first.
*/
- if (unlikely((fault & VM_FAULT_RETRY) &&
- (flags & FAULT_FLAG_ALLOW_RETRY))) {
+ if (unlikely(fault & VM_FAULT_RETRY)) {
flags |= FAULT_FLAG_TRIED;
goto retry;
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1895986842b9..4ba024d5b63a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -714,6 +714,11 @@ static void __init memory_map_bottom_up(unsigned long map_start,
static void __init init_trampoline(void)
{
#ifdef CONFIG_X86_64
+ /*
+ * The code below will alias kernel page-tables in the user-range of the
+ * address space, including the Global bit. So global TLB entries will
+ * be created when using the trampoline page-table.
+ */
if (!kaslr_memory_enabled())
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
else
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 36098226a957..96d34ebb20a9 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -981,7 +981,7 @@ static void __meminit free_pagetable(struct page *page, int order)
if (PageReserved(page)) {
__ClearPageReserved(page);
- magic = (unsigned long)page->freelist;
+ magic = page->index;
if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
while (nr_pages--)
put_page_bootmem(page++);
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 35487305d8af..50d209939c66 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -1,419 +1,18 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * AMD Memory Encryption Support
+ * Memory Encryption Support Common Code
*
* Copyright (C) 2016 Advanced Micro Devices, Inc.
*
* Author: Tom Lendacky <thomas.lendacky@amd.com>
*/
-#define DISABLE_BRANCH_PROFILING
-
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <linux/mm.h>
#include <linux/dma-direct.h>
+#include <linux/dma-mapping.h>
#include <linux/swiotlb.h>
+#include <linux/cc_platform.h>
#include <linux/mem_encrypt.h>
-#include <linux/device.h>
-#include <linux/kernel.h>
-#include <linux/bitops.h>
-#include <linux/dma-mapping.h>
#include <linux/virtio_config.h>
-#include <linux/cc_platform.h>
-
-#include <asm/tlbflush.h>
-#include <asm/fixmap.h>
-#include <asm/setup.h>
-#include <asm/bootparam.h>
-#include <asm/set_memory.h>
-#include <asm/cacheflush.h>
-#include <asm/processor-flags.h>
-#include <asm/msr.h>
-#include <asm/cmdline.h>
-
-#include "mm_internal.h"
-
-/*
- * Since SME related variables are set early in the boot process they must
- * reside in the .data section so as not to be zeroed out when the .bss
- * section is later cleared.
- */
-u64 sme_me_mask __section(".data") = 0;
-u64 sev_status __section(".data") = 0;
-u64 sev_check_data __section(".data") = 0;
-EXPORT_SYMBOL(sme_me_mask);
-DEFINE_STATIC_KEY_FALSE(sev_enable_key);
-EXPORT_SYMBOL_GPL(sev_enable_key);
-
-/* Buffer used for early in-place encryption by BSP, no locking needed */
-static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
-
-/*
- * This routine does not change the underlying encryption setting of the
- * page(s) that map this memory. It assumes that eventually the memory is
- * meant to be accessed as either encrypted or decrypted but the contents
- * are currently not in the desired state.
- *
- * This routine follows the steps outlined in the AMD64 Architecture
- * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
- */
-static void __init __sme_early_enc_dec(resource_size_t paddr,
- unsigned long size, bool enc)
-{
- void *src, *dst;
- size_t len;
-
- if (!sme_me_mask)
- return;
-
- wbinvd();
-
- /*
- * There are limited number of early mapping slots, so map (at most)
- * one page at time.
- */
- while (size) {
- len = min_t(size_t, sizeof(sme_early_buffer), size);
-
- /*
- * Create mappings for the current and desired format of
- * the memory. Use a write-protected mapping for the source.
- */
- src = enc ? early_memremap_decrypted_wp(paddr, len) :
- early_memremap_encrypted_wp(paddr, len);
-
- dst = enc ? early_memremap_encrypted(paddr, len) :
- early_memremap_decrypted(paddr, len);
-
- /*
- * If a mapping can't be obtained to perform the operation,
- * then eventual access of that area in the desired mode
- * will cause a crash.
- */
- BUG_ON(!src || !dst);
-
- /*
- * Use a temporary buffer, of cache-line multiple size, to
- * avoid data corruption as documented in the APM.
- */
- memcpy(sme_early_buffer, src, len);
- memcpy(dst, sme_early_buffer, len);
-
- early_memunmap(dst, len);
- early_memunmap(src, len);
-
- paddr += len;
- size -= len;
- }
-}
-
-void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
-{
- __sme_early_enc_dec(paddr, size, true);
-}
-
-void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
-{
- __sme_early_enc_dec(paddr, size, false);
-}
-
-static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
- bool map)
-{
- unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
- pmdval_t pmd_flags, pmd;
-
- /* Use early_pmd_flags but remove the encryption mask */
- pmd_flags = __sme_clr(early_pmd_flags);
-
- do {
- pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
- __early_make_pgtable((unsigned long)vaddr, pmd);
-
- vaddr += PMD_SIZE;
- paddr += PMD_SIZE;
- size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
- } while (size);
-
- flush_tlb_local();
-}
-
-void __init sme_unmap_bootdata(char *real_mode_data)
-{
- struct boot_params *boot_data;
- unsigned long cmdline_paddr;
-
- if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
- return;
-
- /* Get the command line address before unmapping the real_mode_data */
- boot_data = (struct boot_params *)real_mode_data;
- cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
-
- __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
-
- if (!cmdline_paddr)
- return;
-
- __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
-}
-
-void __init sme_map_bootdata(char *real_mode_data)
-{
- struct boot_params *boot_data;
- unsigned long cmdline_paddr;
-
- if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
- return;
-
- __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
-
- /* Get the command line address after mapping the real_mode_data */
- boot_data = (struct boot_params *)real_mode_data;
- cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
-
- if (!cmdline_paddr)
- return;
-
- __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
-}
-
-void __init sme_early_init(void)
-{
- unsigned int i;
-
- if (!sme_me_mask)
- return;
-
- early_pmd_flags = __sme_set(early_pmd_flags);
-
- __supported_pte_mask = __sme_set(__supported_pte_mask);
-
- /* Update the protection map with memory encryption mask */
- for (i = 0; i < ARRAY_SIZE(protection_map); i++)
- protection_map[i] = pgprot_encrypted(protection_map[i]);
-
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- swiotlb_force = SWIOTLB_FORCE;
-}
-
-void __init sev_setup_arch(void)
-{
- phys_addr_t total_mem = memblock_phys_mem_size();
- unsigned long size;
-
- if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
- return;
-
- /*
- * For SEV, all DMA has to occur via shared/unencrypted pages.
- * SEV uses SWIOTLB to make this happen without changing device
- * drivers. However, depending on the workload being run, the
- * default 64MB of SWIOTLB may not be enough and SWIOTLB may
- * run out of buffers for DMA, resulting in I/O errors and/or
- * performance degradation especially with high I/O workloads.
- *
- * Adjust the default size of SWIOTLB for SEV guests using
- * a percentage of guest memory for SWIOTLB buffers.
- * Also, as the SWIOTLB bounce buffer memory is allocated
- * from low memory, ensure that the adjusted size is within
- * the limits of low available memory.
- *
- * The percentage of guest memory used here for SWIOTLB buffers
- * is more of an approximation of the static adjustment which
- * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
- */
- size = total_mem * 6 / 100;
- size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
- swiotlb_adjust_size(size);
-}
-
-static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
-{
- unsigned long pfn = 0;
- pgprot_t prot;
-
- switch (level) {
- case PG_LEVEL_4K:
- pfn = pte_pfn(*kpte);
- prot = pte_pgprot(*kpte);
- break;
- case PG_LEVEL_2M:
- pfn = pmd_pfn(*(pmd_t *)kpte);
- prot = pmd_pgprot(*(pmd_t *)kpte);
- break;
- case PG_LEVEL_1G:
- pfn = pud_pfn(*(pud_t *)kpte);
- prot = pud_pgprot(*(pud_t *)kpte);
- break;
- default:
- WARN_ONCE(1, "Invalid level for kpte\n");
- return 0;
- }
-
- if (ret_prot)
- *ret_prot = prot;
-
- return pfn;
-}
-
-void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc)
-{
-#ifdef CONFIG_PARAVIRT
- unsigned long sz = npages << PAGE_SHIFT;
- unsigned long vaddr_end = vaddr + sz;
-
- while (vaddr < vaddr_end) {
- int psize, pmask, level;
- unsigned long pfn;
- pte_t *kpte;
-
- kpte = lookup_address(vaddr, &level);
- if (!kpte || pte_none(*kpte)) {
- WARN_ONCE(1, "kpte lookup for vaddr\n");
- return;
- }
-
- pfn = pg_level_to_pfn(level, kpte, NULL);
- if (!pfn)
- continue;
-
- psize = page_level_size(level);
- pmask = page_level_mask(level);
-
- notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc);
-
- vaddr = (vaddr & pmask) + psize;
- }
-#endif
-}
-
-static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
-{
- pgprot_t old_prot, new_prot;
- unsigned long pfn, pa, size;
- pte_t new_pte;
-
- pfn = pg_level_to_pfn(level, kpte, &old_prot);
- if (!pfn)
- return;
-
- new_prot = old_prot;
- if (enc)
- pgprot_val(new_prot) |= _PAGE_ENC;
- else
- pgprot_val(new_prot) &= ~_PAGE_ENC;
-
- /* If prot is same then do nothing. */
- if (pgprot_val(old_prot) == pgprot_val(new_prot))
- return;
-
- pa = pfn << PAGE_SHIFT;
- size = page_level_size(level);
-
- /*
- * We are going to perform in-place en-/decryption and change the
- * physical page attribute from C=1 to C=0 or vice versa. Flush the
- * caches to ensure that data gets accessed with the correct C-bit.
- */
- clflush_cache_range(__va(pa), size);
-
- /* Encrypt/decrypt the contents in-place */
- if (enc)
- sme_early_encrypt(pa, size);
- else
- sme_early_decrypt(pa, size);
-
- /* Change the page encryption mask. */
- new_pte = pfn_pte(pfn, new_prot);
- set_pte_atomic(kpte, new_pte);
-}
-
-static int __init early_set_memory_enc_dec(unsigned long vaddr,
- unsigned long size, bool enc)
-{
- unsigned long vaddr_end, vaddr_next, start;
- unsigned long psize, pmask;
- int split_page_size_mask;
- int level, ret;
- pte_t *kpte;
-
- start = vaddr;
- vaddr_next = vaddr;
- vaddr_end = vaddr + size;
-
- for (; vaddr < vaddr_end; vaddr = vaddr_next) {
- kpte = lookup_address(vaddr, &level);
- if (!kpte || pte_none(*kpte)) {
- ret = 1;
- goto out;
- }
-
- if (level == PG_LEVEL_4K) {
- __set_clr_pte_enc(kpte, level, enc);
- vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
- continue;
- }
-
- psize = page_level_size(level);
- pmask = page_level_mask(level);
-
- /*
- * Check whether we can change the large page in one go.
- * We request a split when the address is not aligned and
- * the number of pages to set/clear encryption bit is smaller
- * than the number of pages in the large page.
- */
- if (vaddr == (vaddr & pmask) &&
- ((vaddr_end - vaddr) >= psize)) {
- __set_clr_pte_enc(kpte, level, enc);
- vaddr_next = (vaddr & pmask) + psize;
- continue;
- }
-
- /*
- * The virtual address is part of a larger page, create the next
- * level page table mapping (4K or 2M). If it is part of a 2M
- * page then we request a split of the large page into 4K
- * chunks. A 1GB large page is split into 2M pages, resp.
- */
- if (level == PG_LEVEL_2M)
- split_page_size_mask = 0;
- else
- split_page_size_mask = 1 << PG_LEVEL_2M;
-
- /*
- * kernel_physical_mapping_change() does not flush the TLBs, so
- * a TLB flush is required after we exit from the for loop.
- */
- kernel_physical_mapping_change(__pa(vaddr & pmask),
- __pa((vaddr_end & pmask) + psize),
- split_page_size_mask);
- }
-
- ret = 0;
-
- notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc);
-out:
- __flush_tlb_all();
- return ret;
-}
-
-int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
-{
- return early_set_memory_enc_dec(vaddr, size, false);
-}
-
-int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
-{
- return early_set_memory_enc_dec(vaddr, size, true);
-}
-
-void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
-{
- notify_range_enc_status_changed(vaddr, npages, enc);
-}
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
bool force_dma_unencrypted(struct device *dev)
@@ -441,30 +40,6 @@ bool force_dma_unencrypted(struct device *dev)
return false;
}
-void __init mem_encrypt_free_decrypted_mem(void)
-{
- unsigned long vaddr, vaddr_end, npages;
- int r;
-
- vaddr = (unsigned long)__start_bss_decrypted_unused;
- vaddr_end = (unsigned long)__end_bss_decrypted;
- npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
-
- /*
- * The unused memory range was mapped decrypted, change the encryption
- * attribute from decrypted to encrypted before freeing it.
- */
- if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
- r = set_memory_encrypted(vaddr, npages);
- if (r) {
- pr_warn("failed to free unused decrypted pages\n");
- return;
- }
- }
-
- free_init_pages("unused decrypted", vaddr, vaddr_end);
-}
-
static void print_mem_encrypt_feature_info(void)
{
pr_info("AMD Memory Encryption Features active:");
@@ -493,20 +68,12 @@ static void print_mem_encrypt_feature_info(void)
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
- if (!sme_me_mask)
+ if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return;
/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
swiotlb_update_mem_attributes();
- /*
- * With SEV, we need to unroll the rep string I/O instructions,
- * but SEV-ES supports them through the #VC handler.
- */
- if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
- !cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
- static_branch_enable(&sev_enable_key);
-
print_mem_encrypt_feature_info();
}
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
new file mode 100644
index 000000000000..2b2d018ea345
--- /dev/null
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -0,0 +1,438 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ */
+
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/linkage.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/dma-direct.h>
+#include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/dma-mapping.h>
+#include <linux/virtio_config.h>
+#include <linux/cc_platform.h>
+
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+#include <asm/setup.h>
+#include <asm/bootparam.h>
+#include <asm/set_memory.h>
+#include <asm/cacheflush.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/cmdline.h>
+
+#include "mm_internal.h"
+
+/*
+ * Since SME related variables are set early in the boot process they must
+ * reside in the .data section so as not to be zeroed out when the .bss
+ * section is later cleared.
+ */
+u64 sme_me_mask __section(".data") = 0;
+u64 sev_status __section(".data") = 0;
+u64 sev_check_data __section(".data") = 0;
+EXPORT_SYMBOL(sme_me_mask);
+
+/* Buffer used for early in-place encryption by BSP, no locking needed */
+static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE);
+
+/*
+ * This routine does not change the underlying encryption setting of the
+ * page(s) that map this memory. It assumes that eventually the memory is
+ * meant to be accessed as either encrypted or decrypted but the contents
+ * are currently not in the desired state.
+ *
+ * This routine follows the steps outlined in the AMD64 Architecture
+ * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place.
+ */
+static void __init __sme_early_enc_dec(resource_size_t paddr,
+ unsigned long size, bool enc)
+{
+ void *src, *dst;
+ size_t len;
+
+ if (!sme_me_mask)
+ return;
+
+ wbinvd();
+
+ /*
+ * There are limited number of early mapping slots, so map (at most)
+ * one page at time.
+ */
+ while (size) {
+ len = min_t(size_t, sizeof(sme_early_buffer), size);
+
+ /*
+ * Create mappings for the current and desired format of
+ * the memory. Use a write-protected mapping for the source.
+ */
+ src = enc ? early_memremap_decrypted_wp(paddr, len) :
+ early_memremap_encrypted_wp(paddr, len);
+
+ dst = enc ? early_memremap_encrypted(paddr, len) :
+ early_memremap_decrypted(paddr, len);
+
+ /*
+ * If a mapping can't be obtained to perform the operation,
+ * then eventual access of that area in the desired mode
+ * will cause a crash.
+ */
+ BUG_ON(!src || !dst);
+
+ /*
+ * Use a temporary buffer, of cache-line multiple size, to
+ * avoid data corruption as documented in the APM.
+ */
+ memcpy(sme_early_buffer, src, len);
+ memcpy(dst, sme_early_buffer, len);
+
+ early_memunmap(dst, len);
+ early_memunmap(src, len);
+
+ paddr += len;
+ size -= len;
+ }
+}
+
+void __init sme_early_encrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, true);
+}
+
+void __init sme_early_decrypt(resource_size_t paddr, unsigned long size)
+{
+ __sme_early_enc_dec(paddr, size, false);
+}
+
+static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
+ bool map)
+{
+ unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET;
+ pmdval_t pmd_flags, pmd;
+
+ /* Use early_pmd_flags but remove the encryption mask */
+ pmd_flags = __sme_clr(early_pmd_flags);
+
+ do {
+ pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0;
+ __early_make_pgtable((unsigned long)vaddr, pmd);
+
+ vaddr += PMD_SIZE;
+ paddr += PMD_SIZE;
+ size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
+ } while (size);
+
+ flush_tlb_local();
+}
+
+void __init sme_unmap_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ /* Get the command line address before unmapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false);
+}
+
+void __init sme_map_bootdata(char *real_mode_data)
+{
+ struct boot_params *boot_data;
+ unsigned long cmdline_paddr;
+
+ if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
+ return;
+
+ __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true);
+
+ /* Get the command line address after mapping the real_mode_data */
+ boot_data = (struct boot_params *)real_mode_data;
+ cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32);
+
+ if (!cmdline_paddr)
+ return;
+
+ __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
+}
+
+void __init sme_early_init(void)
+{
+ unsigned int i;
+
+ if (!sme_me_mask)
+ return;
+
+ early_pmd_flags = __sme_set(early_pmd_flags);
+
+ __supported_pte_mask = __sme_set(__supported_pte_mask);
+
+ /* Update the protection map with memory encryption mask */
+ for (i = 0; i < ARRAY_SIZE(protection_map); i++)
+ protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ swiotlb_force = SWIOTLB_FORCE;
+}
+
+void __init sev_setup_arch(void)
+{
+ phys_addr_t total_mem = memblock_phys_mem_size();
+ unsigned long size;
+
+ if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ return;
+
+ /*
+ * For SEV, all DMA has to occur via shared/unencrypted pages.
+ * SEV uses SWIOTLB to make this happen without changing device
+ * drivers. However, depending on the workload being run, the
+ * default 64MB of SWIOTLB may not be enough and SWIOTLB may
+ * run out of buffers for DMA, resulting in I/O errors and/or
+ * performance degradation especially with high I/O workloads.
+ *
+ * Adjust the default size of SWIOTLB for SEV guests using
+ * a percentage of guest memory for SWIOTLB buffers.
+ * Also, as the SWIOTLB bounce buffer memory is allocated
+ * from low memory, ensure that the adjusted size is within
+ * the limits of low available memory.
+ *
+ * The percentage of guest memory used here for SWIOTLB buffers
+ * is more of an approximation of the static adjustment which
+ * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6%
+ */
+ size = total_mem * 6 / 100;
+ size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G);
+ swiotlb_adjust_size(size);
+}
+
+static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
+{
+ unsigned long pfn = 0;
+ pgprot_t prot;
+
+ switch (level) {
+ case PG_LEVEL_4K:
+ pfn = pte_pfn(*kpte);
+ prot = pte_pgprot(*kpte);
+ break;
+ case PG_LEVEL_2M:
+ pfn = pmd_pfn(*(pmd_t *)kpte);
+ prot = pmd_pgprot(*(pmd_t *)kpte);
+ break;
+ case PG_LEVEL_1G:
+ pfn = pud_pfn(*(pud_t *)kpte);
+ prot = pud_pgprot(*(pud_t *)kpte);
+ break;
+ default:
+ WARN_ONCE(1, "Invalid level for kpte\n");
+ return 0;
+ }
+
+ if (ret_prot)
+ *ret_prot = prot;
+
+ return pfn;
+}
+
+void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc)
+{
+#ifdef CONFIG_PARAVIRT
+ unsigned long sz = npages << PAGE_SHIFT;
+ unsigned long vaddr_end = vaddr + sz;
+
+ while (vaddr < vaddr_end) {
+ int psize, pmask, level;
+ unsigned long pfn;
+ pte_t *kpte;
+
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ WARN_ONCE(1, "kpte lookup for vaddr\n");
+ return;
+ }
+
+ pfn = pg_level_to_pfn(level, kpte, NULL);
+ if (!pfn)
+ continue;
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc);
+
+ vaddr = (vaddr & pmask) + psize;
+ }
+#endif
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+ pgprot_t old_prot, new_prot;
+ unsigned long pfn, pa, size;
+ pte_t new_pte;
+
+ pfn = pg_level_to_pfn(level, kpte, &old_prot);
+ if (!pfn)
+ return;
+
+ new_prot = old_prot;
+ if (enc)
+ pgprot_val(new_prot) |= _PAGE_ENC;
+ else
+ pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+ /* If prot is same then do nothing. */
+ if (pgprot_val(old_prot) == pgprot_val(new_prot))
+ return;
+
+ pa = pfn << PAGE_SHIFT;
+ size = page_level_size(level);
+
+ /*
+ * We are going to perform in-place en-/decryption and change the
+ * physical page attribute from C=1 to C=0 or vice versa. Flush the
+ * caches to ensure that data gets accessed with the correct C-bit.
+ */
+ clflush_cache_range(__va(pa), size);
+
+ /* Encrypt/decrypt the contents in-place */
+ if (enc)
+ sme_early_encrypt(pa, size);
+ else
+ sme_early_decrypt(pa, size);
+
+ /* Change the page encryption mask. */
+ new_pte = pfn_pte(pfn, new_prot);
+ set_pte_atomic(kpte, new_pte);
+}
+
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+ unsigned long size, bool enc)
+{
+ unsigned long vaddr_end, vaddr_next, start;
+ unsigned long psize, pmask;
+ int split_page_size_mask;
+ int level, ret;
+ pte_t *kpte;
+
+ start = vaddr;
+ vaddr_next = vaddr;
+ vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ kpte = lookup_address(vaddr, &level);
+ if (!kpte || pte_none(*kpte)) {
+ ret = 1;
+ goto out;
+ }
+
+ if (level == PG_LEVEL_4K) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+ continue;
+ }
+
+ psize = page_level_size(level);
+ pmask = page_level_mask(level);
+
+ /*
+ * Check whether we can change the large page in one go.
+ * We request a split when the address is not aligned and
+ * the number of pages to set/clear encryption bit is smaller
+ * than the number of pages in the large page.
+ */
+ if (vaddr == (vaddr & pmask) &&
+ ((vaddr_end - vaddr) >= psize)) {
+ __set_clr_pte_enc(kpte, level, enc);
+ vaddr_next = (vaddr & pmask) + psize;
+ continue;
+ }
+
+ /*
+ * The virtual address is part of a larger page, create the next
+ * level page table mapping (4K or 2M). If it is part of a 2M
+ * page then we request a split of the large page into 4K
+ * chunks. A 1GB large page is split into 2M pages, resp.
+ */
+ if (level == PG_LEVEL_2M)
+ split_page_size_mask = 0;
+ else
+ split_page_size_mask = 1 << PG_LEVEL_2M;
+
+ /*
+ * kernel_physical_mapping_change() does not flush the TLBs, so
+ * a TLB flush is required after we exit from the for loop.
+ */
+ kernel_physical_mapping_change(__pa(vaddr & pmask),
+ __pa((vaddr_end & pmask) + psize),
+ split_page_size_mask);
+ }
+
+ ret = 0;
+
+ notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc);
+out:
+ __flush_tlb_all();
+ return ret;
+}
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, false);
+}
+
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+ return early_set_memory_enc_dec(vaddr, size, true);
+}
+
+void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc)
+{
+ notify_range_enc_status_changed(vaddr, npages, enc);
+}
+
+void __init mem_encrypt_free_decrypted_mem(void)
+{
+ unsigned long vaddr, vaddr_end, npages;
+ int r;
+
+ vaddr = (unsigned long)__start_bss_decrypted_unused;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+
+ /*
+ * The unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it.
+ */
+ if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ r = set_memory_encrypted(vaddr, npages);
+ if (r) {
+ pr_warn("failed to free unused decrypted pages\n");
+ return;
+ }
+ }
+
+ free_init_pages("unused decrypted", vaddr, vaddr_end);
+}
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index 17d292b7072f..3d1dba05fce4 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -65,7 +65,7 @@ SYM_FUNC_START(sme_encrypt_execute)
movq %rbp, %rsp /* Restore original stack pointer */
pop %rbp
- ret
+ RET
SYM_FUNC_END(sme_encrypt_execute)
SYM_FUNC_START(__enc_copy)
@@ -151,6 +151,6 @@ SYM_FUNC_START(__enc_copy)
pop %r12
pop %r15
- ret
+ RET
.L__enc_copy_end:
SYM_FUNC_END(__enc_copy)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 59ba2968af1b..a6cf56a14939 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -361,7 +361,7 @@ static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
{
- unsigned long next_tif = task_thread_info(next)->flags;
+ unsigned long next_tif = read_task_thread_flags(next);
unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
/*
@@ -1148,7 +1148,7 @@ void flush_tlb_one_user(unsigned long addr)
*/
STATIC_NOPV void native_flush_tlb_global(void)
{
- unsigned long cr4, flags;
+ unsigned long flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) {
/*
@@ -1168,11 +1168,7 @@ STATIC_NOPV void native_flush_tlb_global(void)
*/
raw_local_irq_save(flags);
- cr4 = this_cpu_read(cpu_tlbstate.cr4);
- /* toggle PGE */
- native_write_cr4(cr4 ^ X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+ __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
raw_local_irq_restore(flags);
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 726700fabca6..2b1e266ff95c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1,9 +1,9 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * bpf_jit_comp.c: BPF JIT compiler
+ * BPF JIT compiler
*
* Copyright (C) 2011-2013 Eric Dumazet (eric.dumazet@gmail.com)
- * Internal BPF Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/netdevice.h>
#include <linux/filter.h>
@@ -412,7 +412,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
* ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
* if (index >= array->map.max_entries)
* goto out;
- * if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
* prog = array->ptrs[index];
* if (prog == NULL)
@@ -446,14 +446,14 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
EMIT2(X86_JBE, offset); /* jbe out */
/*
- * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
offset = ctx->tail_call_indirect_label - (prog + 2 - start);
- EMIT2(X86_JA, offset); /* ja out */
+ EMIT2(X86_JAE, offset); /* jae out */
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
@@ -504,14 +504,14 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
int offset;
/*
- * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
offset = ctx->tail_call_direct_label - (prog + 2 - start);
- EMIT2(X86_JA, offset); /* ja out */
+ EMIT2(X86_JAE, offset); /* jae out */
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
@@ -1252,19 +1252,54 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
- /* test src_reg, src_reg */
- maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
- EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
- /* jne start_of_ldx */
- EMIT2(X86_JNE, 0);
+ /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM
+ * add abs(insn->off) to the limit to make sure that negative
+ * offset won't be an issue.
+ * insn->off is s16, so it won't affect valid pointers.
+ */
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off);
+ u8 *end_of_jmp1, *end_of_jmp2;
+
+ /* Conservatively check that src_reg + insn->off is a kernel address:
+ * 1. src_reg + insn->off >= limit
+ * 2. src_reg + insn->off doesn't become small positive.
+ * Cannot do src_reg + insn->off >= limit in one branch,
+ * since it needs two spare registers, but JIT has only one.
+ */
+
+ /* movabsq r11, limit */
+ EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
+ EMIT((u32)limit, 4);
+ EMIT(limit >> 32, 4);
+ /* cmp src_reg, r11 */
+ maybe_emit_mod(&prog, src_reg, AUX_REG, true);
+ EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
+ /* if unsigned '<' goto end_of_jmp2 */
+ EMIT2(X86_JB, 0);
+ end_of_jmp1 = prog;
+
+ /* mov r11, src_reg */
+ emit_mov_reg(&prog, true, AUX_REG, src_reg);
+ /* add r11, insn->off */
+ maybe_emit_1mod(&prog, AUX_REG, true);
+ EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
+ /* jmp if not carry to start_of_ldx
+ * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr
+ * that has to be rejected.
+ */
+ EMIT2(0x73 /* JNC */, 0);
+ end_of_jmp2 = prog;
+
/* xor dst_reg, dst_reg */
emit_mov_imm32(&prog, false, dst_reg, 0);
/* jmp byte_after_ldx */
EMIT2(0xEB, 0);
- /* populate jmp_offset for JNE above */
- temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
+ /* populate jmp_offset for JB above to jump to xor dst_reg */
+ end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1;
+ /* populate jmp_offset for JNC above to jump to start_of_ldx */
start_of_ldx = prog;
+ end_of_jmp2[-1] = start_of_ldx - end_of_jmp2;
}
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
@@ -1291,7 +1326,7 @@ st: if (is_imm8(insn->off))
}
ex->insn = delta;
- ex->type = EX_TYPE_BPF;
+ ex->data = EX_TYPE_BPF;
if (dst_reg > BPF_REG_9) {
pr_err("verifier error\n");
@@ -1305,7 +1340,7 @@ st: if (is_imm8(insn->off))
* End result: x86 insn "mov rbx, qword ptr [rax+0x14]"
* of 4 bytes will be ignored and rbx will be zero inited.
*/
- ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8);
+ ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
}
break;
@@ -1941,7 +1976,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
void *orig_call)
{
int ret, i, nr_args = m->nr_args;
- int stack_size = nr_args * 8;
+ int regs_off, ip_off, args_off, stack_size = nr_args * 8;
struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY];
struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT];
struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN];
@@ -1956,14 +1991,39 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (!is_valid_bpf_tramp_flags(flags))
return -EINVAL;
+ /* Generated trampoline stack layout:
+ *
+ * RBP + 8 [ return address ]
+ * RBP + 0 [ RBP ]
+ *
+ * RBP - 8 [ return value ] BPF_TRAMP_F_CALL_ORIG or
+ * BPF_TRAMP_F_RET_FENTRY_RET flags
+ *
+ * [ reg_argN ] always
+ * [ ... ]
+ * RBP - regs_off [ reg_arg1 ] program's ctx pointer
+ *
+ * RBP - args_off [ args count ] always
+ *
+ * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
+ */
+
/* room for return value of orig_call or fentry prog */
save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET);
if (save_ret)
stack_size += 8;
+ regs_off = stack_size;
+
+ /* args count */
+ stack_size += 8;
+ args_off = stack_size;
+
if (flags & BPF_TRAMP_F_IP_ARG)
stack_size += 8; /* room for IP address argument */
+ ip_off = stack_size;
+
if (flags & BPF_TRAMP_F_SKIP_FRAME)
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
@@ -1977,23 +2037,25 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
EMIT1(0x53); /* push rbx */
+ /* Store number of arguments of the traced function:
+ * mov rax, nr_args
+ * mov QWORD PTR [rbp - args_off], rax
+ */
+ emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args);
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);
+
if (flags & BPF_TRAMP_F_IP_ARG) {
/* Store IP address of the traced function:
* mov rax, QWORD PTR [rbp + 8]
* sub rax, X86_PATCH_SIZE
- * mov QWORD PTR [rbp - stack_size], rax
+ * mov QWORD PTR [rbp - ip_off], rax
*/
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE);
- emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -stack_size);
-
- /* Continue with stack_size for regs storage, stack will
- * be correctly restored with 'leave' instruction.
- */
- stack_size -= 8;
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
}
- save_regs(m, &prog, nr_args, stack_size);
+ save_regs(m, &prog, nr_args, regs_off);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
@@ -2005,7 +2067,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (fentry->nr_progs)
- if (invoke_bpf(m, &prog, fentry, stack_size,
+ if (invoke_bpf(m, &prog, fentry, regs_off,
flags & BPF_TRAMP_F_RET_FENTRY_RET))
return -EINVAL;
@@ -2015,7 +2077,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (!branches)
return -ENOMEM;
- if (invoke_bpf_mod_ret(m, &prog, fmod_ret, stack_size,
+ if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off,
branches)) {
ret = -EINVAL;
goto cleanup;
@@ -2023,7 +2085,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- restore_regs(m, &prog, nr_args, stack_size);
+ restore_regs(m, &prog, nr_args, regs_off);
/* call original function */
if (emit_call(&prog, orig_call, prog)) {
@@ -2053,13 +2115,13 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (fexit->nr_progs)
- if (invoke_bpf(m, &prog, fexit, stack_size, false)) {
+ if (invoke_bpf(m, &prog, fexit, regs_off, false)) {
ret = -EINVAL;
goto cleanup;
}
if (flags & BPF_TRAMP_F_RESTORE_REGS)
- restore_regs(m, &prog, nr_args, stack_size);
+ restore_regs(m, &prog, nr_args, regs_off);
/* This needs to be done regardless. If there were fmod_ret programs,
* the return value is only updated on the stack and still needs to be
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index da9b7cfa4632..429a89c5468b 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1323,7 +1323,7 @@ static void emit_bpf_tail_call(u8 **pprog, u8 *ip)
EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
/*
- * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
lo = (u32)MAX_TAIL_CALL_CNT;
@@ -1337,7 +1337,7 @@ static void emit_bpf_tail_call(u8 **pprog, u8 *ip)
/* cmp ecx,lo */
EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo);
- /* ja out */
+ /* jae out */
EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
/* add eax,0x1 */
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 948656069cdd..052f1d78a562 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -20,7 +20,7 @@ struct pci_root_info {
};
static bool pci_use_crs = true;
-static bool pci_ignore_seg = false;
+static bool pci_ignore_seg;
static int __init set_use_crs(const struct dmi_system_id *id)
{
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 2edd86649468..615a76d70019 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -353,8 +353,8 @@ static void pci_fixup_video(struct pci_dev *pdev)
}
}
}
-DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
- PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID,
+ PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
static const struct dmi_system_id msi_k8t_dmi_table[] = {
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 12da00558631..9bb1e2941179 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -184,7 +184,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (ret)
goto error;
i = 0;
- for_each_pci_msi_entry(msidesc, dev) {
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) {
irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
(type == PCI_CAP_ID_MSI) ? nvec : 1,
(type == PCI_CAP_ID_MSIX) ?
@@ -235,7 +235,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
- for_each_pci_msi_entry(msidesc, dev) {
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) {
pirq = xen_allocate_pirq_msi(dev, msidesc);
if (pirq < 0) {
irq = -ENODEV;
@@ -270,7 +270,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int ret = 0;
struct msi_desc *msidesc;
- for_each_pci_msi_entry(msidesc, dev) {
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_NOTASSOCIATED) {
struct physdev_map_pirq map_irq;
domid_t domid;
@@ -306,7 +306,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
return -EINVAL;
map_irq.table_base = pci_resource_start(dev, bir);
- map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+ map_irq.entry_nr = msidesc->msi_index;
}
ret = -EINVAL;
@@ -351,10 +351,13 @@ out:
return ret;
}
-static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
+bool xen_initdom_restore_msi(struct pci_dev *dev)
{
int ret = 0;
+ if (!xen_initial_domain())
+ return true;
+
if (pci_seg_supported) {
struct physdev_pci_device restore_ext;
@@ -375,10 +378,10 @@ static void xen_initdom_restore_msi_irqs(struct pci_dev *dev)
ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore);
WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret);
}
+ return false;
}
#else /* CONFIG_XEN_PV_DOM0 */
#define xen_initdom_setup_msi_irqs NULL
-#define xen_initdom_restore_msi_irqs NULL
#endif /* !CONFIG_XEN_PV_DOM0 */
static void xen_teardown_msi_irqs(struct pci_dev *dev)
@@ -386,19 +389,15 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
struct msi_desc *msidesc;
int i;
- for_each_pci_msi_entry(msidesc, dev) {
- if (msidesc->irq) {
- for (i = 0; i < msidesc->nvec_used; i++)
- xen_destroy_irq(msidesc->irq + i);
- }
+ msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) {
+ for (i = 0; i < msidesc->nvec_used; i++)
+ xen_destroy_irq(msidesc->irq + i);
}
}
static void xen_pv_teardown_msi_irqs(struct pci_dev *dev)
{
- struct msi_desc *msidesc = first_pci_msi_entry(dev);
-
- if (msidesc->msi_attrib.is_msix)
+ if (dev->msix_enabled)
xen_pci_frontend_disable_msix(dev);
else
xen_pci_frontend_disable_msi(dev);
@@ -414,10 +413,7 @@ static int xen_msi_domain_alloc_irqs(struct irq_domain *domain,
if (WARN_ON_ONCE(!dev_is_pci(dev)))
return -EINVAL;
- if (first_msi_entry(dev)->msi_attrib.is_msix)
- type = PCI_CAP_ID_MSIX;
- else
- type = PCI_CAP_ID_MSI;
+ type = to_pci_dev(dev)->msix_enabled ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
return xen_msi_ops.setup_msi_irqs(to_pci_dev(dev), nvec, type);
}
@@ -466,12 +462,10 @@ static __init struct irq_domain *xen_create_pci_msi_domain(void)
static __init void xen_setup_pci_msi(void)
{
if (xen_pv_domain()) {
- if (xen_initial_domain()) {
+ if (xen_initial_domain())
xen_msi_ops.setup_msi_irqs = xen_initdom_setup_msi_irqs;
- x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs;
- } else {
+ else
xen_msi_ops.setup_msi_irqs = xen_setup_msi_irqs;
- }
xen_msi_ops.teardown_msi_irqs = xen_pv_teardown_msi_irqs;
pci_msi_ignore_mask = 1;
} else if (xen_hvm_domain()) {
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index 0ac3d4357136..65fa3d866226 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -249,7 +249,7 @@
gpio@26 {
#gpio-cells = <2>;
- compatible = "ti,pcf8575";
+ compatible = "nxp,pcf8575";
reg = <0x26>;
gpio-controller;
};
@@ -263,7 +263,7 @@
gpio@26 {
#gpio-cells = <2>;
- compatible = "ti,pcf8575";
+ compatible = "nxp,pcf8575";
reg = <0x26>;
gpio-controller;
};
diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S
index 09ec84f6ef51..f3cfdb1c9a35 100644
--- a/arch/x86/platform/efi/efi_stub_32.S
+++ b/arch/x86/platform/efi/efi_stub_32.S
@@ -56,5 +56,5 @@ SYM_FUNC_START(efi_call_svam)
movl 16(%esp), %ebx
leave
- ret
+ RET
SYM_FUNC_END(efi_call_svam)
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 90380a17ab23..2206b8bc47b8 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -23,5 +23,5 @@ SYM_FUNC_START(__efi_call)
mov %rsi, %rcx
CALL_NOSPEC rdi
leave
- ret
+ RET
SYM_FUNC_END(__efi_call)
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index fd3dd1708eba..25799d768624 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -37,6 +37,17 @@ SYM_CODE_START(__efi64_thunk)
push %rax
/*
+ * Copy args passed via the stack
+ */
+ subq $0x24, %rsp
+ movq 0x18(%rax), %rbp
+ movq 0x20(%rax), %rbx
+ movq 0x28(%rax), %rax
+ movl %ebp, 0x18(%rsp)
+ movl %ebx, 0x1c(%rsp)
+ movl %eax, 0x20(%rsp)
+
+ /*
* Calculate the physical address of the kernel text.
*/
movq $__START_KERNEL_map, %rax
@@ -47,7 +58,6 @@ SYM_CODE_START(__efi64_thunk)
subq %rax, %rbp
subq %rax, %rbx
- subq $28, %rsp
movl %ebx, 0x0(%rsp) /* return address */
movl %esi, 0x4(%rsp)
movl %edx, 0x8(%rsp)
@@ -60,10 +70,10 @@ SYM_CODE_START(__efi64_thunk)
pushq %rdi /* EFI runtime service address */
lretq
-1: movq 24(%rsp), %rsp
+1: movq 0x20(%rsp), %rsp
pop %rbx
pop %rbp
- retq
+ RET
.code32
2: pushl $__KERNEL_CS
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index b15ebfe40a73..b0b848d6933a 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -277,7 +277,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
return;
}
- new = early_memremap(data.phys_map, data.size);
+ new = early_memremap_prot(data.phys_map, data.size,
+ pgprot_val(pgprot_encrypted(FIXMAP_PAGE_NORMAL)));
if (!new) {
pr_err("Failed to map new boot services memmap\n");
return;
diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S
index 75f4faff8468..3a5abffe5660 100644
--- a/arch/x86/platform/olpc/xo1-wakeup.S
+++ b/arch/x86/platform/olpc/xo1-wakeup.S
@@ -77,7 +77,7 @@ save_registers:
pushfl
popl saved_context_eflags
- ret
+ RET
restore_registers:
movl saved_context_ebp, %ebp
@@ -88,7 +88,7 @@ restore_registers:
pushl saved_context_eflags
popfl
- ret
+ RET
SYM_CODE_START(do_olpc_suspend_lowlevel)
call save_processor_state
@@ -109,7 +109,7 @@ ret_point:
call restore_registers
call restore_processor_state
- ret
+ RET
SYM_CODE_END(do_olpc_suspend_lowlevel)
.data
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index 8786653ad3c0..5606a15cf9a1 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -32,7 +32,7 @@ SYM_FUNC_START(swsusp_arch_suspend)
FRAME_BEGIN
call swsusp_save
FRAME_END
- ret
+ RET
SYM_FUNC_END(swsusp_arch_suspend)
SYM_CODE_START(restore_image)
@@ -108,5 +108,5 @@ SYM_FUNC_START(restore_registers)
/* tell the hibernation core that we've just restored the memory */
movl %eax, in_suspend
- ret
+ RET
SYM_FUNC_END(restore_registers)
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index d9bed596d849..0a0539e1cc81 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -66,7 +66,7 @@ SYM_FUNC_START(restore_registers)
/* tell the hibernation core that we've just restored the memory */
movq %rax, in_suspend(%rip)
- ret
+ RET
SYM_FUNC_END(restore_registers)
SYM_FUNC_START(swsusp_arch_suspend)
@@ -96,7 +96,7 @@ SYM_FUNC_START(swsusp_arch_suspend)
FRAME_BEGIN
call swsusp_save
FRAME_END
- ret
+ RET
SYM_FUNC_END(swsusp_arch_suspend)
SYM_FUNC_START(restore_image)
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 95ea17a9d20c..ae53d54d7959 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -16,7 +16,7 @@ CFLAGS_sha256.o := -D__DISABLE_EXPORTS
# When linking purgatory.ro with -r unresolved symbols are not checked,
# also link a purgatory.chk binary without -r to check for unresolved symbols.
-PURGATORY_LDFLAGS := -e purgatory_start -nostdlib -z nodefaultlib
+PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib
LDFLAGS_purgatory.ro := -r $(PURGATORY_LDFLAGS)
LDFLAGS_purgatory.chk := $(PURGATORY_LDFLAGS)
targets += purgatory.ro purgatory.chk
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 4a3da7592b99..c5e29db02a46 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -17,6 +17,32 @@ u32 *trampoline_cr4_features;
/* Hold the pgd entry used on booting additional CPUs */
pgd_t trampoline_pgd_entry;
+void load_trampoline_pgtable(void)
+{
+#ifdef CONFIG_X86_32
+ load_cr3(initial_page_table);
+#else
+ /*
+ * This function is called before exiting to real-mode and that will
+ * fail with CR4.PCIDE still set.
+ */
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ cr4_clear_bits(X86_CR4_PCIDE);
+
+ write_cr3(real_mode_header->trampoline_pgd);
+#endif
+
+ /*
+ * The CR3 write above will not flush global TLB entries.
+ * Stale, global entries from previous page tables may still be
+ * present. Flush those stale entries.
+ *
+ * This ensures that memory accessed while running with
+ * trampoline_pgd is *actually* mapped into trampoline_pgd.
+ */
+ __flush_tlb_all();
+}
+
void __init reserve_real_mode(void)
{
phys_addr_t mem;
@@ -72,6 +98,7 @@ static void __init setup_real_mode(void)
#ifdef CONFIG_X86_64
u64 *trampoline_pgd;
u64 efer;
+ int i;
#endif
base = (unsigned char *)real_mode_header;
@@ -128,8 +155,17 @@ static void __init setup_real_mode(void)
trampoline_header->flags = 0;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
+
+ /* Map the real mode stub as virtual == physical */
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
- trampoline_pgd[511] = init_top_pgt[511].pgd;
+
+ /*
+ * Include the entirety of the kernel mapping into the trampoline
+ * PGD. This way, all mappings present in the normal kernel page
+ * tables are usable while running on trampoline_pgd.
+ */
+ for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
+ trampoline_pgd[i] = init_top_pgt[i].pgd;
#endif
sme_sev_setup_real_mode(trampoline_header);
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index c736cf2ac76b..e2c5b296120d 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -68,7 +68,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
"(__parainstructions|__alt_instructions)(_end)?|"
"(__iommu_table|__apicdrivers|__smp_locks)(_end)?|"
"__(start|end)_pci_.*|"
-#if CONFIG_FW_LOADER_BUILTIN
+#if CONFIG_FW_LOADER
"__(start|end)_builtin_fw|"
#endif
"__(start|stop)___ksymtab(_gpl)?|"
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 95d26a69088b..40d6a06e41c8 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -8,7 +8,6 @@ endmenu
config UML_X86
def_bool y
- select GENERIC_FIND_FIRST_BIT
config 64BIT
bool "64-bit kernel" if "$(SUBARCH)" = "x86"
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 5ccb18290d71..ba5789c35809 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -40,7 +40,7 @@ $(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \
-Iarch/x86/include/generated
targets += user-offsets.s
-include/generated/user_constants.h: $(obj)/user-offsets.s
+include/generated/user_constants.h: $(obj)/user-offsets.s FORCE
$(call filechk,offsets,__USER_CONSTANT_H__)
UNPROFILE_OBJS := stub_segv.o
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 165be7f9a964..4da336965698 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -2,6 +2,7 @@
#ifndef _ASM_UM_BARRIER_H_
#define _ASM_UM_BARRIER_H_
+#include <asm/cpufeatures.h>
#include <asm/alternative.h>
/*
diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h
index 453db377150d..2ef507bc6989 100644
--- a/arch/x86/um/asm/segment.h
+++ b/arch/x86/um/asm/segment.h
@@ -8,12 +8,4 @@ extern int host_gdt_entry_tls_min;
#define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min
#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
-typedef struct {
- unsigned long seg;
-} mm_segment_t;
-
-#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
-#define KERNEL_DS MAKE_MM_SEG(~0UL)
-#define USER_DS MAKE_MM_SEG(TASK_SIZE)
-
#endif
diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S
index 13f118dec74f..aed782ab7721 100644
--- a/arch/x86/um/checksum_32.S
+++ b/arch/x86/um/checksum_32.S
@@ -110,7 +110,7 @@ csum_partial:
7:
popl %ebx
popl %esi
- ret
+ RET
#else
@@ -208,7 +208,7 @@ csum_partial:
80:
popl %ebx
popl %esi
- ret
+ RET
#endif
EXPORT_SYMBOL(csum_partial)
diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c
index 3c423dfcd78b..df8f4b4bf98b 100644
--- a/arch/x86/um/os-Linux/registers.c
+++ b/arch/x86/um/os-Linux/registers.c
@@ -15,6 +15,7 @@
#include <sys/uio.h>
#include <asm/sigcontext.h>
#include <linux/elf.h>
+#include <registers.h>
int have_xstate_support;
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index 2497bac56066..0bc4b73a9cde 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <asm/ptrace-abi.h>
+#include <registers.h>
#include <skas.h>
extern int arch_switch_tls(struct task_struct *to);
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index 1401899dee9b..289d0159b041 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -11,6 +11,7 @@
#define __FRAME_OFFSETS
#include <asm/ptrace.h>
#include <linux/uaccess.h>
+#include <registers.h>
#include <asm/ptrace-abi.h>
/*
diff --git a/arch/x86/um/setjmp_32.S b/arch/x86/um/setjmp_32.S
index 62eaf8c80e04..2d991ddbcca5 100644
--- a/arch/x86/um/setjmp_32.S
+++ b/arch/x86/um/setjmp_32.S
@@ -34,7 +34,7 @@ kernel_setjmp:
movl %esi,12(%edx)
movl %edi,16(%edx)
movl %ecx,20(%edx) # Return address
- ret
+ RET
.size kernel_setjmp,.-kernel_setjmp
diff --git a/arch/x86/um/setjmp_64.S b/arch/x86/um/setjmp_64.S
index 1b5d40d4ff46..b46acb6a8ebd 100644
--- a/arch/x86/um/setjmp_64.S
+++ b/arch/x86/um/setjmp_64.S
@@ -33,7 +33,7 @@ kernel_setjmp:
movq %r14,40(%rdi)
movq %r15,48(%rdi)
movq %rsi,56(%rdi) # Return address
- ret
+ RET
.size kernel_setjmp,.-kernel_setjmp
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
index 8a7d5e1da98e..48d6cd12f8a5 100644
--- a/arch/x86/um/shared/sysdep/syscalls_64.h
+++ b/arch/x86/um/shared/sysdep/syscalls_64.h
@@ -23,9 +23,6 @@ extern syscall_handler_t *sys_call_table[];
UPT_SYSCALL_ARG5(&regs->regs), \
UPT_SYSCALL_ARG6(&regs->regs)))
-extern long old_mmap(unsigned long addr, unsigned long len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff);
extern syscall_handler_t sys_modify_ldt;
extern syscall_handler_t sys_arch_prctl;
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 7c11c9e5d7ea..263e1d08f216 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -12,6 +12,7 @@
#include <linux/uaccess.h>
#include <asm/ucontext.h>
#include <frame_kern.h>
+#include <registers.h>
#include <skas.h>
#ifdef CONFIG_X86_32
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 0575decb5e54..89df5d89d664 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -9,8 +9,6 @@
#include <linux/cache.h>
#include <asm/syscall.h>
-#define __NO_STUBS
-
/*
* Below you can see, in terms of #define's, the differences between the x86-64
* and the UML syscall table.
@@ -23,8 +21,6 @@
#define sys_vm86old sys_ni_syscall
#define sys_vm86 sys_ni_syscall
-#define old_mmap sys_old_mmap
-
#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 95725b5a41ac..b0b4cfd2308c 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -9,8 +9,6 @@
#include <linux/cache.h>
#include <asm/syscall.h>
-#define __NO_STUBS
-
/*
* Below you can see, in terms of #define's, the differences between the x86-64
* and the UML syscall table.
@@ -20,21 +18,6 @@
#define sys_iopl sys_ni_syscall
#define sys_ioperm sys_ni_syscall
-/*
- * The UML TLS problem. Note that x86_64 does not implement this, so the below
- * is needed only for the ia32 compatibility.
- */
-
-/* On UML we call it this way ("old" means it's not mmap2) */
-#define sys_mmap old_mmap
-
-#define stub_clone sys_clone
-#define stub_fork sys_fork
-#define stub_vfork sys_vfork
-#define stub_execve sys_execve
-#define stub_execveat sys_execveat
-#define stub_rt_sigreturn sys_rt_sigreturn
-
#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index 58f51667e2e4..fe5323f0c42d 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -10,7 +10,9 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <asm/prctl.h> /* XXX This should get the constants from libc */
+#include <registers.h>
#include <os.h>
+#include <registers.h>
long arch_prctl(struct task_struct *task, int option,
unsigned long __user *arg2)
@@ -35,7 +37,7 @@ long arch_prctl(struct task_struct *task, int option,
switch (option) {
case ARCH_SET_FS:
case ARCH_SET_GS:
- ret = restore_registers(pid, &current->thread.regs.regs);
+ ret = restore_pid_registers(pid, &current->thread.regs.regs);
if (ret)
return ret;
break;
@@ -87,3 +89,13 @@ void arch_switch_to(struct task_struct *to)
arch_prctl(to, ARCH_SET_FS, (void __user *) to->thread.arch.fs);
}
+
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, off)
+{
+ if (off & ~PAGE_MASK)
+ return -EINVAL;
+
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
+}
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 6bcd3d8ca6ac..85246dd9faa1 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -23,6 +23,7 @@ config XEN_PV
select PARAVIRT_XXL
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
+ select GUEST_PERF_EVENTS
help
Support running as a Xen PV guest.
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index e13b0b49fcdf..89dd6b1708b0 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -413,34 +413,29 @@ int pmu_apic_update(uint32_t val)
}
/* perf callbacks */
-static int xen_is_in_guest(void)
+static unsigned int xen_guest_state(void)
{
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ unsigned int state = 0;
if (!xenpmu_data) {
pr_warn_once("%s: pmudata not initialized\n", __func__);
- return 0;
+ return state;
}
if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
- return 0;
+ return state;
- return 1;
-}
-
-static int xen_is_user_mode(void)
-{
- const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ state |= PERF_GUEST_ACTIVE;
- if (!xenpmu_data) {
- pr_warn_once("%s: pmudata not initialized\n", __func__);
- return 0;
+ if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) {
+ if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER)
+ state |= PERF_GUEST_USER;
+ } else if (xenpmu_data->pmu.r.regs.cpl & 3) {
+ state |= PERF_GUEST_USER;
}
- if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
- return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
- else
- return !!(xenpmu_data->pmu.r.regs.cpl & 3);
+ return state;
}
static unsigned long xen_get_guest_ip(void)
@@ -456,9 +451,8 @@ static unsigned long xen_get_guest_ip(void)
}
static struct perf_guest_info_callbacks xen_guest_cbs = {
- .is_in_guest = xen_is_in_guest,
- .is_user_mode = xen_is_user_mode,
- .get_guest_ip = xen_get_guest_ip,
+ .state = xen_guest_state,
+ .get_ip = xen_get_guest_ip,
};
/* Convert registers from Xen's format to Linux' */
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
index e336f223f7f4..31b1e3477cb6 100644
--- a/arch/x86/xen/vga.c
+++ b/arch/x86/xen/vga.c
@@ -63,13 +63,17 @@ void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
}
if (size >= offsetof(struct dom0_vga_console_info,
- u.vesa_lfb.gbl_caps)
- + sizeof(info->u.vesa_lfb.gbl_caps))
- screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
- if (size >= offsetof(struct dom0_vga_console_info,
u.vesa_lfb.mode_attrs)
+ sizeof(info->u.vesa_lfb.mode_attrs))
screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+
+ if (size >= offsetof(struct dom0_vga_console_info,
+ u.vesa_lfb.ext_lfb_base)
+ + sizeof(info->u.vesa_lfb.ext_lfb_base)
+ && info->u.vesa_lfb.ext_lfb_base) {
+ screen_info->ext_lfb_base = info->u.vesa_lfb.ext_lfb_base;
+ screen_info->capabilities |= VIDEO_CAPABILITY_64BIT_BASE;
+ }
break;
}
}
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 220dd9678494..e730e6200e64 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/linkage.h>
+#include <../entry/calling.h>
.pushsection .noinstr.text, "ax"
/*
@@ -28,7 +29,7 @@
*/
SYM_FUNC_START(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
- ret
+ RET
SYM_FUNC_END(xen_irq_disable_direct)
/*
@@ -57,7 +58,7 @@ SYM_FUNC_START(check_events)
pop %rcx
pop %rax
FRAME_END
- ret
+ RET
SYM_FUNC_END(check_events)
/*
@@ -83,7 +84,7 @@ SYM_FUNC_START(xen_irq_enable_direct)
call check_events
1:
FRAME_END
- ret
+ RET
SYM_FUNC_END(xen_irq_enable_direct)
/*
@@ -99,7 +100,7 @@ SYM_FUNC_START(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
addb %ah, %ah
- ret
+ RET
SYM_FUNC_END(xen_save_fl_direct)
SYM_FUNC_START(xen_read_cr2)
@@ -107,14 +108,14 @@ SYM_FUNC_START(xen_read_cr2)
_ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX
_ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX
FRAME_END
- ret
+ RET
SYM_FUNC_END(xen_read_cr2);
SYM_FUNC_START(xen_read_cr2_direct)
FRAME_BEGIN
_ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX
FRAME_END
- ret
+ RET
SYM_FUNC_END(xen_read_cr2_direct);
.popsection
@@ -193,6 +194,25 @@ SYM_CODE_START(xen_iret)
SYM_CODE_END(xen_iret)
/*
+ * XEN pv doesn't use trampoline stack, PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is
+ * also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode()
+ * in XEN pv would cause %rsp to move up to the top of the kernel stack and
+ * leave the IRET frame below %rsp, which is dangerous to be corrupted if #NMI
+ * interrupts. And swapgs_restore_regs_and_return_to_usermode() pushing the IRET
+ * frame at the same address is useless.
+ */
+SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
+ UNWIND_HINT_REGS
+ POP_REGS
+
+ /* stackleak_erase() can work safely on the kernel stack. */
+ STACKLEAK_ERASE_NOCLOBBER
+
+ addq $8, %rsp /* skip regs->orig_ax */
+ jmp xen_iret
+SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)
+
+/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
* - kernel gs
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 6a64496edefb..11d286529fe5 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -26,7 +26,7 @@ SYM_CODE_START(hypercall_page)
.rept (PAGE_SIZE / 32)
UNWIND_HINT_FUNC
.skip 31, 0x90
- ret
+ RET
.endr
#define HYPERCALL(n) \