aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig35
-rw-r--r--arch/x86/Makefile36
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S482
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c753
-rw-r--r--arch/x86/crypto/blake2s-glue.c150
-rw-r--r--arch/x86/crypto/blowfish_glue.c107
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S298
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S351
-rw-r--r--arch/x86/crypto/camellia.h (renamed from arch/x86/include/asm/crypto/camellia.h)24
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c198
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c216
-rw-r--r--arch/x86/crypto/camellia_glue.c145
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c287
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S84
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c207
-rw-r--r--arch/x86/crypto/des3_ede_glue.c104
-rw-r--r--arch/x86/crypto/ecb_cbc_helpers.h76
-rw-r--r--arch/x86/crypto/glue_helper-asm-avx.S104
-rw-r--r--arch/x86/crypto/glue_helper-asm-avx2.S136
-rw-r--r--arch/x86/crypto/glue_helper.c381
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S68
-rw-r--r--arch/x86/crypto/serpent-avx.h21
-rw-r--r--arch/x86/crypto/serpent-avx2-asm_64.S87
-rw-r--r--arch/x86/crypto/serpent-sse2.h (renamed from arch/x86/include/asm/crypto/serpent-sse2.h)0
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c185
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c215
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c150
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S80
-rw-r--r--arch/x86/crypto/twofish.h (renamed from arch/x86/include/asm/crypto/twofish.h)4
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c211
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c160
-rw-r--r--arch/x86/entry/common.c22
-rw-r--r--arch/x86/entry/entry_64.S67
-rw-r--r--arch/x86/entry/entry_64_compat.S2
-rw-r--r--arch/x86/entry/syscalls/Makefile29
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/entry/vdso/Makefile3
-rw-r--r--arch/x86/events/core.c25
-rw-r--r--arch/x86/events/intel/core.c555
-rw-r--r--arch/x86/events/intel/ds.c133
-rw-r--r--arch/x86/events/intel/uncore.c58
-rw-r--r--arch/x86/events/intel/uncore.h5
-rw-r--r--arch/x86/events/intel/uncore_snb.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c114
-rw-r--r--arch/x86/events/perf_event.h21
-rw-r--r--arch/x86/events/probe.c7
-rw-r--r--arch/x86/events/probe.h7
-rw-r--r--arch/x86/events/rapl.c34
-rw-r--r--arch/x86/hyperv/Makefile4
-rw-r--r--arch/x86/hyperv/hv_init.c122
-rw-r--r--arch/x86/hyperv/hv_proc.c219
-rw-r--r--arch/x86/hyperv/irqdomain.c385
-rw-r--r--arch/x86/include/asm/acrn.h78
-rw-r--r--arch/x86/include/asm/apb_timer.h40
-rw-r--r--arch/x86/include/asm/compat.h11
-rw-r--r--arch/x86/include/asm/cpufeature.h7
-rw-r--r--arch/x86/include/asm/cpufeatures.h19
-rw-r--r--arch/x86/include/asm/crypto/glue_helper.h118
-rw-r--r--arch/x86/include/asm/crypto/serpent-avx.h42
-rw-r--r--arch/x86/include/asm/disabled-features.h3
-rw-r--r--arch/x86/include/asm/efi.h46
-rw-r--r--arch/x86/include/asm/elf.h2
-rw-r--r--arch/x86/include/asm/elfcore-compat.h31
-rw-r--r--arch/x86/include/asm/fixmap.h3
-rw-r--r--arch/x86/include/asm/fpu/api.h12
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h38
-rw-r--r--arch/x86/include/asm/idtentry.h17
-rw-r--r--arch/x86/include/asm/insn-eval.h2
-rw-r--r--arch/x86/include/asm/insn.h45
-rw-r--r--arch/x86/include/asm/intel-mid.h93
-rw-r--r--arch/x86/include/asm/intel_mid_vrtc.h10
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h2
-rw-r--r--arch/x86/include/asm/intel_scu_ipc_legacy.h91
-rw-r--r--arch/x86/include/asm/irq.h4
-rw-r--r--arch/x86/include/asm/irq_stack.h279
-rw-r--r--arch/x86/include/asm/irqflags.h46
-rw-r--r--arch/x86/include/asm/kfence.h64
-rw-r--r--arch/x86/include/asm/kprobes.h11
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h123
-rw-r--r--arch/x86/include/asm/kvm_host.h168
-rw-r--r--arch/x86/include/asm/mce.h22
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/mshyperv.h19
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/orc_types.h10
-rw-r--r--arch/x86/include/asm/page_64_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h30
-rw-r--r--arch/x86/include/asm/paravirt_types.h17
-rw-r--r--arch/x86/include/asm/perf_event.h24
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/include/asm/platform_sst_audio.h2
-rw-r--r--arch/x86/include/asm/preempt.h48
-rw-r--r--arch/x86/include/asm/processor.h18
-rw-r--r--arch/x86/include/asm/proto.h1
-rw-r--r--arch/x86/include/asm/ptrace.h15
-rw-r--r--arch/x86/include/asm/required-features.h3
-rw-r--r--arch/x86/include/asm/resctrl.h11
-rw-r--r--arch/x86/include/asm/smap.h10
-rw-r--r--arch/x86/include/asm/softirq_stack.h11
-rw-r--r--arch/x86/include/asm/special_insns.h6
-rw-r--r--arch/x86/include/asm/static_call.h7
-rw-r--r--arch/x86/include/asm/thermal.h13
-rw-r--r--arch/x86/include/asm/thread_info.h15
-rw-r--r--arch/x86/include/asm/tlb.h1
-rw-r--r--arch/x86/include/asm/unwind_hints.h13
-rw-r--r--arch/x86/include/asm/virtext.h25
-rw-r--r--arch/x86/include/asm/vm86.h1
-rw-r--r--arch/x86/include/asm/vmx.h1
-rw-r--r--arch/x86/include/asm/vmxfeatures.h1
-rw-r--r--arch/x86/include/asm/xen/interface.h3
-rw-r--r--arch/x86/include/asm/xen/page.h12
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1
-rw-r--r--arch/x86/include/uapi/asm/vm86.h4
-rw-r--r--arch/x86/include/uapi/asm/vmx.h4
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/Makefile1
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S4
-rw-r--r--arch/x86/kernel/apb_timer.c347
-rw-r--r--arch/x86/kernel/apic/apic.c37
-rw-r--r--arch/x86/kernel/apic/io_apic.c14
-rw-r--r--arch/x86/kernel/asm-offsets_64.c3
-rw-r--r--arch/x86/kernel/cpu/acrn.c16
-rw-r--r--arch/x86/kernel/cpu/common.c7
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/mce/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mce/core.c16
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c1
-rw-r--r--arch/x86/kernel/cpu/mce/therm_throt.c739
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c58
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c2
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c11
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h1
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c24
-rw-r--r--arch/x86/kernel/cpu/scattered.c5
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c8
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c14
-rw-r--r--arch/x86/kernel/dumpstack_64.c22
-rw-r--r--arch/x86/kernel/fpu/xstate.c4
-rw-r--r--arch/x86/kernel/ftrace_64.S8
-rw-r--r--arch/x86/kernel/irq.c23
-rw-r--r--arch/x86/kernel/irq_32.c1
-rw-r--r--arch/x86/kernel/irq_64.c12
-rw-r--r--arch/x86/kernel/irqflags.S11
-rw-r--r--arch/x86/kernel/kprobes/core.c168
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c2
-rw-r--r--arch/x86/kernel/kvm.c23
-rw-r--r--arch/x86/kernel/kvmclock.c19
-rw-r--r--arch/x86/kernel/ldt.c10
-rw-r--r--arch/x86/kernel/module.c1
-rw-r--r--arch/x86/kernel/msr.c7
-rw-r--r--arch/x86/kernel/paravirt.c7
-rw-r--r--arch/x86/kernel/paravirt_patch.c10
-rw-r--r--arch/x86/kernel/pci-iommu_table.c3
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/ptrace.c46
-rw-r--r--arch/x86/kernel/reboot.c39
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/sev-es.c22
-rw-r--r--arch/x86/kernel/signal.c24
-rw-r--r--arch/x86/kernel/static_call.c17
-rw-r--r--arch/x86/kernel/sys_x86_64.c8
-rw-r--r--arch/x86/kernel/traps.c3
-rw-r--r--arch/x86/kernel/unwind_orc.c19
-rw-r--r--arch/x86/kernel/vm86_32.c62
-rw-r--r--arch/x86/kvm/Kconfig9
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/cpuid.c26
-rw-r--r--arch/x86/kvm/cpuid.h24
-rw-r--r--arch/x86/kvm/emulate.c14
-rw-r--r--arch/x86/kvm/hyperv.c436
-rw-r--r--arch/x86/kvm/hyperv.h55
-rw-r--r--arch/x86/kvm/irq.c10
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h10
-rw-r--r--arch/x86/kvm/kvm_emulate.h2
-rw-r--r--arch/x86/kvm/lapic.c72
-rw-r--r--arch/x86/kvm/lapic.h20
-rw-r--r--arch/x86/kvm/mmu.h8
-rw-r--r--arch/x86/kvm/mmu/mmu.c555
-rw-r--r--arch/x86/kvm/mmu/mmu_audit.c8
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h27
-rw-r--r--arch/x86/kvm/mmu/page_track.c8
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h22
-rw-r--r--arch/x86/kvm/mmu/spte.c2
-rw-r--r--arch/x86/kvm/mmu/spte.h33
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c68
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h23
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c619
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h35
-rw-r--r--arch/x86/kvm/mtrr.c12
-rw-r--r--arch/x86/kvm/pmu.c10
-rw-r--r--arch/x86/kvm/pmu.h2
-rw-r--r--arch/x86/kvm/svm/avic.c35
-rw-r--r--arch/x86/kvm/svm/nested.c56
-rw-r--r--arch/x86/kvm/svm/sev.c104
-rw-r--r--arch/x86/kvm/svm/svm.c345
-rw-r--r--arch/x86/kvm/svm/svm.h29
-rw-r--r--arch/x86/kvm/svm/svm_ops.h69
-rw-r--r--arch/x86/kvm/trace.h40
-rw-r--r--arch/x86/kvm/vmx/capabilities.h28
-rw-r--r--arch/x86/kvm/vmx/nested.c143
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c294
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c6
-rw-r--r--arch/x86/kvm/vmx/vmenter.S2
-rw-r--r--arch/x86/kvm/vmx/vmx.c396
-rw-r--r--arch/x86/kvm/vmx/vmx.h58
-rw-r--r--arch/x86/kvm/x86.c993
-rw-r--r--arch/x86/kvm/x86.h12
-rw-r--r--arch/x86/kvm/xen.c721
-rw-r--r--arch/x86/kvm/xen.h138
-rw-r--r--arch/x86/lib/insn-eval.c66
-rw-r--r--arch/x86/lib/insn.c119
-rw-r--r--arch/x86/lib/retpoline.S2
-rw-r--r--arch/x86/mm/fault.c407
-rw-r--r--arch/x86/mm/init.c19
-rw-r--r--arch/x86/mm/mem_encrypt.c5
-rw-r--r--arch/x86/mm/mmio-mod.c2
-rw-r--r--arch/x86/mm/pat/memtype.c4
-rw-r--r--arch/x86/net/bpf_jit_comp.c426
-rw-r--r--arch/x86/net/bpf_jit_comp32.c6
-rw-r--r--arch/x86/oprofile/Makefile12
-rw-r--r--arch/x86/oprofile/backtrace.c127
-rw-r--r--arch/x86/oprofile/init.c38
-rw-r--r--arch/x86/oprofile/nmi_int.c780
-rw-r--r--arch/x86/oprofile/op_counter.h30
-rw-r--r--arch/x86/oprofile/op_model_amd.c542
-rw-r--r--arch/x86/oprofile/op_model_p4.c723
-rw-r--r--arch/x86/oprofile/op_model_ppro.c245
-rw-r--r--arch/x86/oprofile/op_x86_model.h90
-rw-r--r--arch/x86/pci/intel_mid_pci.c18
-rw-r--r--arch/x86/pci/mmconfig-shared.c6
-rw-r--r--arch/x86/platform/Makefile2
-rw-r--r--arch/x86/platform/efi/efi_64.c33
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S6
-rw-r--r--arch/x86/platform/efi/quirks.c16
-rw-r--r--arch/x86/platform/geode/alix.c19
-rw-r--r--arch/x86/platform/geode/geos.c19
-rw-r--r--arch/x86/platform/geode/net5501.c13
-rw-r--r--arch/x86/platform/goldfish/Makefile2
-rw-r--r--arch/x86/platform/goldfish/goldfish.c54
-rw-r--r--arch/x86/platform/intel-mid/Makefile7
-rw-r--r--arch/x86/platform/intel-mid/device_libs/Makefile33
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c101
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bma023.c16
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bt.c101
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_emc1403.c39
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c81
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_lis331.c37
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_max7315.c77
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c32
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c39
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c78
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c44
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c43
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c50
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c82
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic.c83
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic.h15
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c42
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c32
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c43
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c44
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c31
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c32
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c95
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c42
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_tca6416.c53
-rw-r--r--arch/x86/platform/intel-mid/intel-mid.c106
-rw-r--r--arch/x86/platform/intel-mid/intel_mid_vrtc.c173
-rw-r--r--arch/x86/platform/intel-mid/sfi.c543
-rw-r--r--arch/x86/platform/iris/iris.c1
-rw-r--r--arch/x86/platform/pvh/head.S2
-rw-r--r--arch/x86/platform/sfi/Makefile2
-rw-r--r--arch/x86/platform/sfi/sfi.c100
-rw-r--r--arch/x86/power/Makefile5
-rw-r--r--arch/x86/power/hibernate_asm_64.S103
-rw-r--r--arch/x86/tools/Makefile8
-rw-r--r--arch/x86/tools/insn_sanity.c4
-rw-r--r--arch/x86/tools/relocs.c16
-rw-r--r--arch/x86/um/os-Linux/task_size.c2
-rw-r--r--arch/x86/um/shared/sysdep/stub_32.h33
-rw-r--r--arch/x86/um/shared/sysdep/stub_64.h36
-rw-r--r--arch/x86/um/stub_32.S17
-rw-r--r--arch/x86/um/stub_64.S5
-rw-r--r--arch/x86/um/stub_segv.c5
-rw-r--r--arch/x86/xen/Makefile1
-rw-r--r--arch/x86/xen/enlighten_pv.c32
-rw-r--r--arch/x86/xen/irq.c23
-rw-r--r--arch/x86/xen/p2m.c71
-rw-r--r--arch/x86/xen/setup.c25
-rw-r--r--arch/x86/xen/xen-asm.S80
-rw-r--r--arch/x86/xen/xen-head.S5
-rw-r--r--arch/x86/xen/xen-ops.h3
300 files changed, 8840 insertions, 14180 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 21f851179ff0..2792879d398e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -32,6 +32,7 @@ config X86_64
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select SWIOTLB
+ select ARCH_HAS_ELFCORE_COMPAT
config FORCE_DYNAMIC_FTRACE
def_bool y
@@ -96,6 +97,8 @@ config X86
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
+ select ARCH_SUPPORTS_LTO_CLANG if X86_64
+ select ARCH_SUPPORTS_LTO_CLANG_THIN if X86_64
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
@@ -148,6 +151,7 @@ config X86
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN if X86_64
select HAVE_ARCH_KASAN_VMALLOC if X86_64
+ select HAVE_ARCH_KFENCE
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
@@ -168,6 +172,7 @@ config X86
select HAVE_CONTEXT_TRACKING if X86_64
select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING
select HAVE_C_RECORDMCOUNT
+ select HAVE_OBJTOOL_MCOUNT if STACK_VALIDATION
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
@@ -187,6 +192,7 @@ config X86
select HAVE_HW_BREAKPOINT
select HAVE_IDE
select HAVE_IOREMAP_PROT
+ select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
@@ -206,7 +212,6 @@ config X86
select HAVE_MOVE_PMD
select HAVE_MOVE_PUD
select HAVE_NMI
- select HAVE_OPROFILE
select HAVE_OPTPROBES
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
@@ -220,10 +225,12 @@ config X86
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
select HAVE_FUNCTION_ARG_ACCESS_API
+ select HAVE_SOFTIRQ_ON_OWN_STACK
select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
select HAVE_STACK_VALIDATION if X86_64
select HAVE_STATIC_CALL
select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION
+ select HAVE_PREEMPT_DYNAMIC
select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
@@ -444,7 +451,7 @@ config X86_X2APIC
If you don't know what to do here, say N.
config X86_MPPARSE
- bool "Enable MPS table" if ACPI || SFI
+ bool "Enable MPS table" if ACPI
default y
depends on X86_LOCAL_APIC
help
@@ -603,7 +610,6 @@ config X86_INTEL_MID
depends on PCI
depends on X86_64 || (PCI_GOANY && X86_32)
depends on X86_IO_APIC
- select SFI
select I2C
select DW_APB_TIMER
select APB_TIMER
@@ -890,19 +896,7 @@ config HPET_TIMER
config HPET_EMULATE_RTC
def_bool y
- depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
-
-config APB_TIMER
- def_bool y if X86_INTEL_MID
- prompt "Intel MID APB Timer Support" if X86_INTEL_MID
- select DW_APB_TIMER
- depends on X86_INTEL_MID && SFI
- help
- APB timer is the replacement for 8254, HPET on X86 MID platforms.
- The APBT provides a stable time base on SMP
- systems, unlike the TSC, but it is more expensive to access,
- as it is off-chip. APB timers are always running regardless of CPU
- C states, they are used as per CPU clockevent device when possible.
+ depends on HPET_TIMER && (RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
# Mark as expert because too many people got it wrong.
# The code disables itself when not needed.
@@ -1158,10 +1152,6 @@ config X86_MCE_INJECT
If you don't know what a machine check is and you don't do kernel
QA it is safe to say n.
-config X86_THERMAL_VECTOR
- def_bool y
- depends on X86_MCE_INTEL
-
source "arch/x86/events/Kconfig"
config X86_LEGACY_VM86
@@ -2469,8 +2459,6 @@ source "kernel/power/Kconfig"
source "drivers/acpi/Kconfig"
-source "drivers/sfi/Kconfig"
-
config X86_APM_BOOT
def_bool y
depends on APM
@@ -2657,7 +2645,7 @@ config PCI_DIRECT
config PCI_MMCONFIG
bool "Support mmconfig PCI config space access" if X86_64
default y
- depends on PCI && (ACPI || SFI || JAILHOUSE_GUEST)
+ depends on PCI && (ACPI || JAILHOUSE_GUEST)
depends on X86_64 || (PCI_GOANY || PCI_GOMMCONFIG)
config PCI_OLPC
@@ -2864,7 +2852,6 @@ config IA32_EMULATION
depends on X86_64
select ARCH_WANT_OLD_COMPAT_IPC
select BINFMT_ELF
- select COMPAT_BINFMT_ELF
select COMPAT_OLD_SIGACTION
help
Include code to run legacy 32-bit programs under a
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 30920d70b48b..2d6d5a28c3bf 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -169,6 +169,11 @@ ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1)
KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,)
endif
+ifdef CONFIG_LTO_CLANG
+KBUILD_LDFLAGS += -plugin-opt=-code-model=kernel \
+ -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8)
+endif
+
# Workaround for a gcc prelease that unfortunately was shipped in a suse release
KBUILD_CFLAGS += -Wno-sign-compare
#
@@ -232,9 +237,6 @@ core-y += arch/x86/
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
drivers-$(CONFIG_PCI) += arch/x86/pci/
-# must be linked after kernel/
-drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
-
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
@@ -295,16 +297,20 @@ archclean:
$(Q)$(MAKE) $(clean)=arch/x86/tools
define archhelp
- echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
- echo ' install - Install kernel using'
- echo ' (your) ~/bin/$(INSTALLKERNEL) or'
- echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
- echo ' install to $$(INSTALL_PATH) and run lilo'
- echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
- echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
- echo ' bzdisk/fdimage*/isoimage also accept:'
- echo ' FDARGS="..." arguments for the booted kernel'
- echo ' FDINITRD=file initrd for the booted kernel'
+ echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
+ echo ' install - Install kernel using (your) ~/bin/$(INSTALLKERNEL) or'
+ echo ' (distribution) /sbin/$(INSTALLKERNEL) or install to '
+ echo ' $$(INSTALL_PATH) and run lilo'
+ echo ''
+ echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
+ echo ' bzdisk/fdimage*/isoimage also accept:'
+ echo ' FDARGS="..." arguments for the booted kernel'
+ echo ' FDINITRD=file initrd for the booted kernel'
+ echo ''
+ echo ' kvm_guest.config - Enable Kconfig items for running this kernel as a KVM guest'
+ echo ' xen.config - Enable Kconfig items for running this kernel as a Xen guest'
+
endef
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 78210793d357..9c9c4a888b1d 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -50,7 +50,6 @@ CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_UNUSED_SYMBOLS is not set
CONFIG_BINFMT_MISC=y
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9936528e1939..b60bd2d86034 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -48,7 +48,6 @@ CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
-# CONFIG_UNUSED_SYMBOLS is not set
CONFIG_BINFMT_MISC=y
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a31de0c6ccde..b28e36b7c96b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -4,8 +4,6 @@
OBJECT_FILES_NON_STANDARD := y
-obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
-
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index d1436c37008b..4e3972570916 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -43,10 +43,6 @@
#ifdef __x86_64__
# constants in mergeable sections, linker can reorder and merge
-.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
-.align 16
-.Lgf128mul_x_ble_mask:
- .octa 0x00000000000000010000000000000087
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY: .octa 0xC2000000000000000000000000000001
@@ -146,7 +142,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
#define CTR %xmm11
#define INC %xmm12
-#define GF128MUL_MASK %xmm10
+#define GF128MUL_MASK %xmm7
#ifdef __x86_64__
#define AREG %rax
@@ -2577,13 +2573,140 @@ SYM_FUNC_START(aesni_cbc_dec)
ret
SYM_FUNC_END(aesni_cbc_dec)
-#ifdef __x86_64__
+/*
+ * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_cts_cbc_enc)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ mov 480(KEYP), KLEN
+ movups (IVP), STATE
+ sub $16, LEN
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ movups (T1), %xmm4
+ movups (IVP), %xmm5
+
+ movups (INP), IN1
+ add LEN, INP
+ movups (INP), IN2
+
+ pxor IN1, STATE
+ call _aesni_enc1
+
+ pshufb %xmm5, IN2
+ pxor STATE, IN2
+ pshufb %xmm4, STATE
+ add OUTP, LEN
+ movups STATE, (LEN)
+
+ movaps IN2, STATE
+ call _aesni_enc1
+ movups STATE, (OUTP)
+
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ ret
+SYM_FUNC_END(aesni_cts_cbc_enc)
+
+/*
+ * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ * size_t len, u8 *iv)
+ */
+SYM_FUNC_START(aesni_cts_cbc_dec)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+ movups (IVP), IV
+ sub $16, LEN
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ movups (T1), %xmm4
+
+ movups (INP), STATE
+ add LEN, INP
+ movups (INP), IN1
+
+ call _aesni_dec1
+ movaps STATE, IN2
+ pshufb %xmm4, STATE
+ pxor IN1, STATE
+
+ add OUTP, LEN
+ movups STATE, (LEN)
+
+ movups (IVP), %xmm0
+ pshufb %xmm0, IN1
+ pblendvb IN2, IN1
+ movaps IN1, STATE
+ call _aesni_dec1
+
+ pxor IV, STATE
+ movups STATE, (OUTP)
+
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ ret
+SYM_FUNC_END(aesni_cts_cbc_dec)
+
.pushsection .rodata
.align 16
+.Lcts_permute_table:
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+#ifdef __x86_64__
.Lbswap_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+#endif
.popsection
+#ifdef __x86_64__
/*
* _aesni_inc_init: internal ABI
* setup registers used by _aesni_inc
@@ -2696,6 +2819,14 @@ SYM_FUNC_START(aesni_ctr_enc)
ret
SYM_FUNC_END(aesni_ctr_enc)
+#endif
+
+.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
+.align 16
+.Lgf128mul_x_ble_mask:
+ .octa 0x00000000000000010000000000000087
+.previous
+
/*
* _aesni_gf128mul_x_ble: internal ABI
* Multiply in GF(2^128) for XTS IVs
@@ -2708,120 +2839,325 @@ SYM_FUNC_END(aesni_ctr_enc)
* CTR: == temporary value
*/
#define _aesni_gf128mul_x_ble() \
- pshufd $0x13, IV, CTR; \
+ pshufd $0x13, IV, KEY; \
paddq IV, IV; \
- psrad $31, CTR; \
- pand GF128MUL_MASK, CTR; \
- pxor CTR, IV;
+ psrad $31, KEY; \
+ pand GF128MUL_MASK, KEY; \
+ pxor KEY, IV;
/*
- * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
- * const u8 *src, bool enc, le128 *iv)
+ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
*/
-SYM_FUNC_START(aesni_xts_crypt8)
+SYM_FUNC_START(aesni_xts_encrypt)
FRAME_BEGIN
- testb %cl, %cl
- movl $0, %ecx
- movl $240, %r10d
- leaq _aesni_enc4, %r11
- leaq _aesni_dec4, %rax
- cmovel %r10d, %ecx
- cmoveq %rax, %r11
-
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+#else
+ movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
+#endif
movups (IVP), IV
mov 480(KEYP), KLEN
- addq %rcx, KEYP
+
+.Lxts_enc_loop4:
+ sub $64, LEN
+ jl .Lxts_enc_1x
movdqa IV, STATE1
- movdqu 0x00(INP), INC
- pxor INC, STATE1
+ movdqu 0x00(INP), IN
+ pxor IN, STATE1
movdqu IV, 0x00(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE2
- movdqu 0x10(INP), INC
- pxor INC, STATE2
+ movdqu 0x10(INP), IN
+ pxor IN, STATE2
movdqu IV, 0x10(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE3
- movdqu 0x20(INP), INC
- pxor INC, STATE3
+ movdqu 0x20(INP), IN
+ pxor IN, STATE3
movdqu IV, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE4
- movdqu 0x30(INP), INC
- pxor INC, STATE4
+ movdqu 0x30(INP), IN
+ pxor IN, STATE4
movdqu IV, 0x30(OUTP)
- CALL_NOSPEC r11
+ call _aesni_enc4
- movdqu 0x00(OUTP), INC
- pxor INC, STATE1
+ movdqu 0x00(OUTP), IN
+ pxor IN, STATE1
movdqu STATE1, 0x00(OUTP)
+ movdqu 0x10(OUTP), IN
+ pxor IN, STATE2
+ movdqu STATE2, 0x10(OUTP)
+
+ movdqu 0x20(OUTP), IN
+ pxor IN, STATE3
+ movdqu STATE3, 0x20(OUTP)
+
+ movdqu 0x30(OUTP), IN
+ pxor IN, STATE4
+ movdqu STATE4, 0x30(OUTP)
+
_aesni_gf128mul_x_ble()
- movdqa IV, STATE1
- movdqu 0x40(INP), INC
- pxor INC, STATE1
- movdqu IV, 0x40(OUTP)
- movdqu 0x10(OUTP), INC
- pxor INC, STATE2
- movdqu STATE2, 0x10(OUTP)
+ add $64, INP
+ add $64, OUTP
+ test LEN, LEN
+ jnz .Lxts_enc_loop4
+.Lxts_enc_ret_iv:
+ movups IV, (IVP)
+
+.Lxts_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
+ FRAME_END
+ ret
+
+.Lxts_enc_1x:
+ add $64, LEN
+ jz .Lxts_enc_ret_iv
+ sub $16, LEN
+ jl .Lxts_enc_cts4
+
+.Lxts_enc_loop1:
+ movdqu (INP), STATE
+ pxor IV, STATE
+ call _aesni_enc1
+ pxor IV, STATE
_aesni_gf128mul_x_ble()
- movdqa IV, STATE2
- movdqu 0x50(INP), INC
- pxor INC, STATE2
- movdqu IV, 0x50(OUTP)
- movdqu 0x20(OUTP), INC
- pxor INC, STATE3
- movdqu STATE3, 0x20(OUTP)
+ test LEN, LEN
+ jz .Lxts_enc_out
+
+ add $16, INP
+ sub $16, LEN
+ jl .Lxts_enc_cts1
+
+ movdqu STATE, (OUTP)
+ add $16, OUTP
+ jmp .Lxts_enc_loop1
+
+.Lxts_enc_out:
+ movdqu STATE, (OUTP)
+ jmp .Lxts_enc_ret_iv
+
+.Lxts_enc_cts4:
+ movdqa STATE4, STATE
+ sub $16, OUTP
+
+.Lxts_enc_cts1:
+#ifndef __x86_64__
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
+#endif
+ add LEN, INP /* rewind input pointer */
+ add $16, LEN /* # bytes in final block */
+ movups (INP), IN1
+
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ add OUTP, LEN
+
+ movups (T1), %xmm4
+ movaps STATE, IN2
+ pshufb %xmm4, STATE
+ movups STATE, (LEN)
+
+ movups (IVP), %xmm0
+ pshufb %xmm0, IN1
+ pblendvb IN2, IN1
+ movaps IN1, STATE
+
+ pxor IV, STATE
+ call _aesni_enc1
+ pxor IV, STATE
+
+ movups STATE, (OUTP)
+ jmp .Lxts_enc_ret
+SYM_FUNC_END(aesni_xts_encrypt)
+
+/*
+ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, unsigned int len, le128 *iv)
+ */
+SYM_FUNC_START(aesni_xts_decrypt)
+ FRAME_BEGIN
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+#else
+ movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
+#endif
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+ add $240, KEYP
+
+ test $15, LEN
+ jz .Lxts_dec_loop4
+ sub $16, LEN
+
+.Lxts_dec_loop4:
+ sub $64, LEN
+ jl .Lxts_dec_1x
+
+ movdqa IV, STATE1
+ movdqu 0x00(INP), IN
+ pxor IN, STATE1
+ movdqu IV, 0x00(OUTP)
_aesni_gf128mul_x_ble()
- movdqa IV, STATE3
- movdqu 0x60(INP), INC
- pxor INC, STATE3
- movdqu IV, 0x60(OUTP)
+ movdqa IV, STATE2
+ movdqu 0x10(INP), IN
+ pxor IN, STATE2
+ movdqu IV, 0x10(OUTP)
- movdqu 0x30(OUTP), INC
- pxor INC, STATE4
- movdqu STATE4, 0x30(OUTP)
+ _aesni_gf128mul_x_ble()
+ movdqa IV, STATE3
+ movdqu 0x20(INP), IN
+ pxor IN, STATE3
+ movdqu IV, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE4
- movdqu 0x70(INP), INC
- pxor INC, STATE4
- movdqu IV, 0x70(OUTP)
+ movdqu 0x30(INP), IN
+ pxor IN, STATE4
+ movdqu IV, 0x30(OUTP)
- _aesni_gf128mul_x_ble()
- movups IV, (IVP)
+ call _aesni_dec4
+
+ movdqu 0x00(OUTP), IN
+ pxor IN, STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+ movdqu 0x10(OUTP), IN
+ pxor IN, STATE2
+ movdqu STATE2, 0x10(OUTP)
- CALL_NOSPEC r11
+ movdqu 0x20(OUTP), IN
+ pxor IN, STATE3
+ movdqu STATE3, 0x20(OUTP)
- movdqu 0x40(OUTP), INC
- pxor INC, STATE1
- movdqu STATE1, 0x40(OUTP)
+ movdqu 0x30(OUTP), IN
+ pxor IN, STATE4
+ movdqu STATE4, 0x30(OUTP)
- movdqu 0x50(OUTP), INC
- pxor INC, STATE2
- movdqu STATE2, 0x50(OUTP)
+ _aesni_gf128mul_x_ble()
- movdqu 0x60(OUTP), INC
- pxor INC, STATE3
- movdqu STATE3, 0x60(OUTP)
+ add $64, INP
+ add $64, OUTP
+ test LEN, LEN
+ jnz .Lxts_dec_loop4
- movdqu 0x70(OUTP), INC
- pxor INC, STATE4
- movdqu STATE4, 0x70(OUTP)
+.Lxts_dec_ret_iv:
+ movups IV, (IVP)
+.Lxts_dec_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
FRAME_END
ret
-SYM_FUNC_END(aesni_xts_crypt8)
+.Lxts_dec_1x:
+ add $64, LEN
+ jz .Lxts_dec_ret_iv
+
+.Lxts_dec_loop1:
+ movdqu (INP), STATE
+
+ add $16, INP
+ sub $16, LEN
+ jl .Lxts_dec_cts1
+
+ pxor IV, STATE
+ call _aesni_dec1
+ pxor IV, STATE
+ _aesni_gf128mul_x_ble()
+
+ test LEN, LEN
+ jz .Lxts_dec_out
+
+ movdqu STATE, (OUTP)
+ add $16, OUTP
+ jmp .Lxts_dec_loop1
+
+.Lxts_dec_out:
+ movdqu STATE, (OUTP)
+ jmp .Lxts_dec_ret_iv
+
+.Lxts_dec_cts1:
+ movdqa IV, STATE4
+ _aesni_gf128mul_x_ble()
+
+ pxor IV, STATE
+ call _aesni_dec1
+ pxor IV, STATE
+
+#ifndef __x86_64__
+ lea .Lcts_permute_table, T1
+#else
+ lea .Lcts_permute_table(%rip), T1
#endif
+ add LEN, INP /* rewind input pointer */
+ add $16, LEN /* # bytes in final block */
+ movups (INP), IN1
+
+ mov T1, IVP
+ add $32, IVP
+ add LEN, T1
+ sub LEN, IVP
+ add OUTP, LEN
+
+ movups (T1), %xmm4
+ movaps STATE, IN2
+ pshufb %xmm4, STATE
+ movups STATE, (LEN)
+
+ movups (IVP), %xmm0
+ pshufb %xmm0, IN1
+ pblendvb IN2, IN1
+ movaps IN1, STATE
+
+ pxor STATE4, STATE
+ call _aesni_dec1
+ pxor STATE4, STATE
+
+ movups STATE, (OUTP)
+ jmp .Lxts_dec_ret
+SYM_FUNC_END(aesni_xts_decrypt)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ad8a7188a2bf..2144e54a6c89 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -31,11 +31,10 @@
#include <crypto/internal/aead.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <linux/jump_label.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
-#ifdef CONFIG_X86_64
-#include <asm/crypto/glue_helper.h>
-#endif
+#include <linux/static_call.h>
#define AESNI_ALIGN 16
@@ -93,62 +92,25 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
+asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
#ifdef CONFIG_X86_64
-static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
-
-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, bool enc, le128 *iv);
-
-/* asmlinkage void aesni_gcm_enc()
- * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
- * struct gcm_context_data. May be uninitialized.
- * u8 *out, Ciphertext output. Encrypt in-place is allowed.
- * const u8 *in, Plaintext input
- * unsigned long plaintext_len, Length of data in bytes for encryption.
- * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
- * 16-byte aligned pointer.
- * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
- * const u8 *aad, Additional Authentication Data (AAD)
- * unsigned long aad_len, Length of AAD in bytes.
- * u8 *auth_tag, Authenticated Tag output.
- * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
- * Valid values are 16 (most likely), 12 or 8.
- */
-asmlinkage void aesni_gcm_enc(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-/* asmlinkage void aesni_gcm_dec()
- * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
- * struct gcm_context_data. May be uninitialized.
- * u8 *out, Plaintext output. Decrypt in-place is allowed.
- * const u8 *in, Ciphertext input
- * unsigned long ciphertext_len, Length of data in bytes for decryption.
- * u8 *iv, Pre-counter block j0: 12 byte IV concatenated with 0x00000001.
- * 16-byte aligned pointer.
- * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
- * const u8 *aad, Additional Authentication Data (AAD)
- * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
- * to be 8 or 12 bytes
- * u8 *auth_tag, Authenticated Tag output.
- * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
- * Valid values are 16 (most likely), 12 or 8.
- */
-asmlinkage void aesni_gcm_dec(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long ciphertext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
+DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc);
/* Scatter / Gather routines, with args similar to above */
asmlinkage void aesni_gcm_init(void *ctx,
@@ -167,24 +129,6 @@ asmlinkage void aesni_gcm_finalize(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
-static const struct aesni_gcm_tfm_s {
- void (*init)(void *ctx, struct gcm_context_data *gdata, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len);
- void (*enc_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len);
- void (*dec_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long ciphertext_len);
- void (*finalize)(void *ctx, struct gcm_context_data *gdata,
- u8 *auth_tag, unsigned long auth_tag_len);
-} *aesni_gcm_tfm;
-
-static const struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
- .init = &aesni_gcm_init,
- .enc_update = &aesni_gcm_enc_update,
- .dec_update = &aesni_gcm_dec_update,
- .finalize = &aesni_gcm_finalize,
-};
-
asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
@@ -214,25 +158,6 @@ asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
-asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len, u8 *iv,
- const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long ciphertext_len, u8 *iv,
- const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
- .init = &aesni_gcm_init_avx_gen2,
- .enc_update = &aesni_gcm_enc_update_avx_gen2,
- .dec_update = &aesni_gcm_dec_update_avx_gen2,
- .finalize = &aesni_gcm_finalize_avx_gen2,
-};
-
/*
* asmlinkage void aesni_gcm_init_avx_gen4()
* gcm_data *my_ctx_data, context data
@@ -256,24 +181,8 @@ asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
-asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long plaintext_len, u8 *iv,
- const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in, unsigned long ciphertext_len, u8 *iv,
- const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len);
-
-static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
- .init = &aesni_gcm_init_avx_gen4,
- .enc_update = &aesni_gcm_enc_update_avx_gen4,
- .dec_update = &aesni_gcm_dec_update_avx_gen4,
- .finalize = &aesni_gcm_finalize_avx_gen4,
-};
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2);
static inline struct
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
@@ -374,16 +283,16 @@ static int ecb_encrypt(struct skcipher_request *req)
unsigned int nbytes;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
@@ -396,16 +305,16 @@ static int ecb_decrypt(struct skcipher_request *req)
unsigned int nbytes;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
@@ -418,16 +327,16 @@ static int cbc_encrypt(struct skcipher_request *req)
unsigned int nbytes;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
@@ -440,36 +349,133 @@ static int cbc_decrypt(struct skcipher_request *req)
unsigned int nbytes;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
+ kernel_fpu_begin();
aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
+ kernel_fpu_end();
nbytes &= AES_BLOCK_SIZE - 1;
err = skcipher_walk_done(&walk, nbytes);
}
- kernel_fpu_end();
return err;
}
-#ifdef CONFIG_X86_64
-static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
- struct skcipher_walk *walk)
+static int cts_cbc_encrypt(struct skcipher_request *req)
{
- u8 *ctrblk = walk->iv;
- u8 keystream[AES_BLOCK_SIZE];
- u8 *src = walk->src.virt.addr;
- u8 *dst = walk->dst.virt.addr;
- unsigned int nbytes = walk->nbytes;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = cbc_encrypt(&subreq);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ aesni_cts_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ kernel_fpu_end();
+
+ return skcipher_walk_done(&walk, 0);
+}
+
+static int cts_cbc_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+ struct scatterlist *src = req->src, *dst = req->dst;
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
+ NULL, NULL);
+
+ if (req->cryptlen <= AES_BLOCK_SIZE) {
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+ cbc_blocks = 1;
+ }
+
+ if (cbc_blocks > 0) {
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
+
+ err = cbc_decrypt(&subreq);
+ if (err)
+ return err;
+
+ if (req->cryptlen == AES_BLOCK_SIZE)
+ return 0;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst,
+ subreq.cryptlen);
+ }
+
+ /* handle ciphertext stealing */
+ skcipher_request_set_crypt(&subreq, src, dst,
+ req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
+ req->iv);
- aesni_enc(ctx, keystream, ctrblk);
- crypto_xor_cpy(dst, keystream, src, nbytes);
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ aesni_cts_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ kernel_fpu_end();
- crypto_inc(ctrblk, AES_BLOCK_SIZE);
+ return skcipher_walk_done(&walk, 0);
}
+#ifdef CONFIG_X86_64
static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv)
{
@@ -491,120 +497,36 @@ static int ctr_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ u8 keystream[AES_BLOCK_SIZE];
struct skcipher_walk walk;
unsigned int nbytes;
int err;
- err = skcipher_walk_virt(&walk, req, true);
+ err = skcipher_walk_virt(&walk, req, false);
- kernel_fpu_begin();
- while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
- aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK, walk.iv);
- nbytes &= AES_BLOCK_SIZE - 1;
+ while ((nbytes = walk.nbytes) > 0) {
+ kernel_fpu_begin();
+ if (nbytes & AES_BLOCK_MASK)
+ static_call(aesni_ctr_enc_tfm)(ctx, walk.dst.virt.addr,
+ walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK,
+ walk.iv);
+ nbytes &= ~AES_BLOCK_MASK;
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ aesni_enc(ctx, keystream, walk.iv);
+ crypto_xor_cpy(walk.dst.virt.addr + walk.nbytes - nbytes,
+ walk.src.virt.addr + walk.nbytes - nbytes,
+ keystream, nbytes);
+ crypto_inc(walk.iv, AES_BLOCK_SIZE);
+ nbytes = 0;
+ }
+ kernel_fpu_end();
err = skcipher_walk_done(&walk, nbytes);
}
- if (walk.nbytes) {
- ctr_crypt_final(ctx, &walk);
- err = skcipher_walk_done(&walk, 0);
- }
- kernel_fpu_end();
-
return err;
}
-static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- keylen /= 2;
-
- /* first half of xts-key is for crypt */
- err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
- key, keylen);
- if (err)
- return err;
-
- /* second half of xts-key is for tweak */
- return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
- key + keylen, keylen);
-}
-
-
-static void aesni_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_enc);
-}
-
-static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
-}
-
-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- aesni_xts_crypt8(ctx, dst, src, true, iv);
-}
-
-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- aesni_xts_crypt8(ctx, dst, src, false, iv);
-}
-
-static const struct common_glue_ctx aesni_enc_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = 1,
-
- .funcs = { {
- .num_blocks = 8,
- .fn_u = { .xts = aesni_xts_enc8 }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = aesni_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx aesni_dec_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = 1,
-
- .funcs = { {
- .num_blocks = 8,
- .fn_u = { .xts = aesni_xts_dec8 }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = aesni_xts_dec }
- } }
-};
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&aesni_enc_xts, req, aesni_enc,
- aes_ctx(ctx->raw_tweak_ctx),
- aes_ctx(ctx->raw_crypt_ctx),
- false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&aesni_dec_xts, req, aesni_enc,
- aes_ctx(ctx->raw_tweak_ctx),
- aes_ctx(ctx->raw_crypt_ctx),
- true);
-}
-
static int
rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
{
@@ -681,42 +603,35 @@ static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
unsigned int assoclen, u8 *hash_subkey,
- u8 *iv, void *aes_ctx)
+ u8 *iv, void *aes_ctx, u8 *auth_tag,
+ unsigned long auth_tag_len)
{
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
- struct gcm_context_data data AESNI_ALIGN_ATTR;
- struct scatter_walk dst_sg_walk = {};
+ u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8);
+ struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN);
unsigned long left = req->cryptlen;
- unsigned long len, srclen, dstlen;
struct scatter_walk assoc_sg_walk;
- struct scatter_walk src_sg_walk;
- struct scatterlist src_start[2];
- struct scatterlist dst_start[2];
- struct scatterlist *src_sg;
- struct scatterlist *dst_sg;
- u8 *src, *dst, *assoc;
+ struct skcipher_walk walk;
+ bool do_avx, do_avx2;
u8 *assocmem = NULL;
- u8 authTag[16];
+ u8 *assoc;
+ int err;
if (!enc)
left -= auth_tag_len;
- if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
- gcm_tfm = &aesni_gcm_tfm_avx_gen2;
- if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
- gcm_tfm = &aesni_gcm_tfm_sse;
+ do_avx = (left >= AVX_GEN2_OPTSIZE);
+ do_avx2 = (left >= AVX_GEN4_OPTSIZE);
/* Linearize assoc, if not already linear */
- if (req->src->length >= assoclen && req->src->length &&
- (!PageHighMem(sg_page(req->src)) ||
- req->src->offset + req->src->length <= PAGE_SIZE)) {
+ if (req->src->length >= assoclen && req->src->length) {
scatterwalk_start(&assoc_sg_walk, req->src);
assoc = scatterwalk_map(&assoc_sg_walk);
} else {
+ gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
+ GFP_KERNEL : GFP_ATOMIC;
+
/* assoc can be any length, so must be on heap */
- assocmem = kmalloc(assoclen, GFP_ATOMIC);
+ assocmem = kmalloc(assoclen, flags);
if (unlikely(!assocmem))
return -ENOMEM;
assoc = assocmem;
@@ -724,62 +639,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
}
- if (left) {
- src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
- scatterwalk_start(&src_sg_walk, src_sg);
- if (req->src != req->dst) {
- dst_sg = scatterwalk_ffwd(dst_start, req->dst,
- req->assoclen);
- scatterwalk_start(&dst_sg_walk, dst_sg);
- }
- }
-
kernel_fpu_begin();
- gcm_tfm->init(aes_ctx, &data, iv,
- hash_subkey, assoc, assoclen);
- if (req->src != req->dst) {
- while (left) {
- src = scatterwalk_map(&src_sg_walk);
- dst = scatterwalk_map(&dst_sg_walk);
- srclen = scatterwalk_clamp(&src_sg_walk, left);
- dstlen = scatterwalk_clamp(&dst_sg_walk, left);
- len = min(srclen, dstlen);
- if (len) {
- if (enc)
- gcm_tfm->enc_update(aes_ctx, &data,
- dst, src, len);
- else
- gcm_tfm->dec_update(aes_ctx, &data,
- dst, src, len);
- }
- left -= len;
-
- scatterwalk_unmap(src);
- scatterwalk_unmap(dst);
- scatterwalk_advance(&src_sg_walk, len);
- scatterwalk_advance(&dst_sg_walk, len);
- scatterwalk_done(&src_sg_walk, 0, left);
- scatterwalk_done(&dst_sg_walk, 1, left);
- }
- } else {
- while (left) {
- dst = src = scatterwalk_map(&src_sg_walk);
- len = scatterwalk_clamp(&src_sg_walk, left);
- if (len) {
- if (enc)
- gcm_tfm->enc_update(aes_ctx, &data,
- src, src, len);
- else
- gcm_tfm->dec_update(aes_ctx, &data,
- src, src, len);
- }
- left -= len;
- scatterwalk_unmap(src);
- scatterwalk_advance(&src_sg_walk, len);
- scatterwalk_done(&src_sg_walk, 1, left);
- }
- }
- gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len);
+ if (static_branch_likely(&gcm_use_avx2) && do_avx2)
+ aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc,
+ assoclen);
+ else if (static_branch_likely(&gcm_use_avx) && do_avx)
+ aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc,
+ assoclen);
+ else
+ aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen);
kernel_fpu_end();
if (!assocmem)
@@ -787,24 +655,58 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
else
kfree(assocmem);
- if (!enc) {
- u8 authTagMsg[16];
+ err = enc ? skcipher_walk_aead_encrypt(&walk, req, false)
+ : skcipher_walk_aead_decrypt(&walk, req, false);
- /* Copy out original authTag */
- scatterwalk_map_and_copy(authTagMsg, req->src,
- req->assoclen + req->cryptlen -
- auth_tag_len,
- auth_tag_len, 0);
+ while (walk.nbytes > 0) {
+ kernel_fpu_begin();
+ if (static_branch_likely(&gcm_use_avx2) && do_avx2) {
+ if (enc)
+ aesni_gcm_enc_update_avx_gen4(aes_ctx, data,
+ walk.dst.virt.addr,
+ walk.src.virt.addr,
+ walk.nbytes);
+ else
+ aesni_gcm_dec_update_avx_gen4(aes_ctx, data,
+ walk.dst.virt.addr,
+ walk.src.virt.addr,
+ walk.nbytes);
+ } else if (static_branch_likely(&gcm_use_avx) && do_avx) {
+ if (enc)
+ aesni_gcm_enc_update_avx_gen2(aes_ctx, data,
+ walk.dst.virt.addr,
+ walk.src.virt.addr,
+ walk.nbytes);
+ else
+ aesni_gcm_dec_update_avx_gen2(aes_ctx, data,
+ walk.dst.virt.addr,
+ walk.src.virt.addr,
+ walk.nbytes);
+ } else if (enc) {
+ aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.nbytes);
+ } else {
+ aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr,
+ walk.src.virt.addr, walk.nbytes);
+ }
+ kernel_fpu_end();
- /* Compare generated tag with passed in tag. */
- return crypto_memneq(authTagMsg, authTag, auth_tag_len) ?
- -EBADMSG : 0;
+ err = skcipher_walk_done(&walk, 0);
}
- /* Copy in the authTag */
- scatterwalk_map_and_copy(authTag, req->dst,
- req->assoclen + req->cryptlen,
- auth_tag_len, 1);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ if (static_branch_likely(&gcm_use_avx2) && do_avx2)
+ aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag,
+ auth_tag_len);
+ else if (static_branch_likely(&gcm_use_avx) && do_avx)
+ aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag,
+ auth_tag_len);
+ else
+ aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len);
+ kernel_fpu_end();
return 0;
}
@@ -812,15 +714,47 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
- return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
- aes_ctx);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ u8 auth_tag[16];
+ int err;
+
+ err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx,
+ auth_tag, auth_tag_len);
+ if (err)
+ return err;
+
+ scatterwalk_map_and_copy(auth_tag, req->dst,
+ req->assoclen + req->cryptlen,
+ auth_tag_len, 1);
+ return 0;
}
static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
- return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
- aes_ctx);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ u8 auth_tag_msg[16];
+ u8 auth_tag[16];
+ int err;
+
+ err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx,
+ auth_tag, auth_tag_len);
+ if (err)
+ return err;
+
+ /* Copy out original auth_tag */
+ scatterwalk_map_and_copy(auth_tag_msg, req->src,
+ req->assoclen + req->cryptlen - auth_tag_len,
+ auth_tag_len, 0);
+
+ /* Compare generated tag with passed in tag. */
+ if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) {
+ memzero_explicit(auth_tag, sizeof(auth_tag));
+ return -EBADMSG;
+ }
+ return 0;
}
static int helper_rfc4106_encrypt(struct aead_request *req)
@@ -828,7 +762,8 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
- u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
unsigned int i;
__be32 counter = cpu_to_be32(1);
@@ -855,7 +790,8 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
- u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
unsigned int i;
if (unlikely(req->assoclen != 16 && req->assoclen != 20))
@@ -877,6 +813,128 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
}
#endif
+static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int err;
+
+ err = xts_verify_key(tfm, key, keylen);
+ if (err)
+ return err;
+
+ keylen /= 2;
+
+ /* first half of xts-key is for crypt */
+ err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
+ key, keylen);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
+ key + keylen, keylen);
+}
+
+static int xts_crypt(struct skcipher_request *req, bool encrypt)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int tail = req->cryptlen % AES_BLOCK_SIZE;
+ struct skcipher_request subreq;
+ struct skcipher_walk walk;
+ int err;
+
+ if (req->cryptlen < AES_BLOCK_SIZE)
+ return -EINVAL;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+ int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+
+ skcipher_walk_abort(&walk);
+
+ skcipher_request_set_tfm(&subreq, tfm);
+ skcipher_request_set_callback(&subreq,
+ skcipher_request_flags(req),
+ NULL, NULL);
+ skcipher_request_set_crypt(&subreq, req->src, req->dst,
+ blocks * AES_BLOCK_SIZE, req->iv);
+ req = &subreq;
+ err = skcipher_walk_virt(&walk, req, false);
+ } else {
+ tail = 0;
+ }
+
+ kernel_fpu_begin();
+
+ /* calculate first value of T */
+ aesni_enc(aes_ctx(ctx->raw_tweak_ctx), walk.iv, walk.iv);
+
+ while (walk.nbytes > 0) {
+ int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes &= ~(AES_BLOCK_SIZE - 1);
+
+ if (encrypt)
+ aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx),
+ walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes, walk.iv);
+ else
+ aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx),
+ walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes, walk.iv);
+ kernel_fpu_end();
+
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+
+ if (walk.nbytes > 0)
+ kernel_fpu_begin();
+ }
+
+ if (unlikely(tail > 0 && !err)) {
+ struct scatterlist sg_src[2], sg_dst[2];
+ struct scatterlist *src, *dst;
+
+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+ if (req->dst != req->src)
+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+ req->iv);
+
+ err = skcipher_walk_virt(&walk, &subreq, false);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ if (encrypt)
+ aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx),
+ walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ else
+ aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx),
+ walk.dst.virt.addr, walk.src.virt.addr,
+ walk.nbytes, walk.iv);
+ kernel_fpu_end();
+
+ err = skcipher_walk_done(&walk, 0);
+ }
+ return err;
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+ return xts_crypt(req, true);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+ return xts_crypt(req, false);
+}
+
static struct crypto_alg aesni_cipher_alg = {
.cra_name = "aes",
.cra_driver_name = "aes-aesni",
@@ -928,6 +986,23 @@ static struct skcipher_alg aesni_skciphers[] = {
.setkey = aesni_skcipher_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cts(cbc(aes))",
+ .cra_driver_name = "__cts-cbc-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
+ },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = cts_cbc_encrypt,
+ .decrypt = cts_cbc_decrypt,
#ifdef CONFIG_X86_64
}, {
.base = {
@@ -946,6 +1021,7 @@ static struct skcipher_alg aesni_skciphers[] = {
.setkey = aesni_skcipher_setkey,
.encrypt = ctr_crypt,
.decrypt = ctr_crypt,
+#endif
}, {
.base = {
.cra_name = "__xts(aes)",
@@ -959,10 +1035,10 @@ static struct skcipher_alg aesni_skciphers[] = {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
+ .walksize = 2 * AES_BLOCK_SIZE,
.setkey = xts_aesni_setkey,
.encrypt = xts_encrypt,
.decrypt = xts_decrypt,
-#endif
}
};
@@ -985,7 +1061,8 @@ static int generic_gcmaes_encrypt(struct aead_request *req)
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
- u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
__be32 counter = cpu_to_be32(1);
memcpy(iv, req->iv, 12);
@@ -1001,7 +1078,8 @@ static int generic_gcmaes_decrypt(struct aead_request *req)
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
void *aes_ctx = &(ctx->aes_key_expanded);
- u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+ u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
+ u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
memcpy(iv, req->iv, 12);
*((__be32 *)(iv+12)) = counter;
@@ -1066,19 +1144,18 @@ static int __init aesni_init(void)
#ifdef CONFIG_X86_64
if (boot_cpu_has(X86_FEATURE_AVX2)) {
pr_info("AVX2 version of gcm_enc/dec engaged.\n");
- aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
+ static_branch_enable(&gcm_use_avx);
+ static_branch_enable(&gcm_use_avx2);
} else
if (boot_cpu_has(X86_FEATURE_AVX)) {
pr_info("AVX version of gcm_enc/dec engaged.\n");
- aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
+ static_branch_enable(&gcm_use_avx);
} else {
pr_info("SSE version of gcm_enc/dec engaged.\n");
- aesni_gcm_tfm = &aesni_gcm_tfm_sse;
}
- aesni_ctr_enc_tfm = aesni_ctr_enc;
if (boot_cpu_has(X86_FEATURE_AVX)) {
/* optimize performance of ctr mode encryption transform */
- aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
+ static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
pr_info("AES CTR mode by8 optimization enabled\n");
}
#endif
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index c025a01cf708..a40365ab301e 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -58,138 +58,40 @@ void blake2s_compress_arch(struct blake2s_state *state,
}
EXPORT_SYMBOL(blake2s_compress_arch);
-static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
- unsigned int keylen)
+static int crypto_blake2s_update_x86(struct shash_desc *desc,
+ const u8 *in, unsigned int inlen)
{
- struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
-
- if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE)
- return -EINVAL;
-
- memcpy(tctx->key, key, keylen);
- tctx->keylen = keylen;
-
- return 0;
+ return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
}
-static int crypto_blake2s_init(struct shash_desc *desc)
+static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
{
- struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
- struct blake2s_state *state = shash_desc_ctx(desc);
- const int outlen = crypto_shash_digestsize(desc->tfm);
-
- if (tctx->keylen)
- blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
- else
- blake2s_init(state, outlen);
-
- return 0;
+ return crypto_blake2s_final(desc, out, blake2s_compress_arch);
}
-static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
- unsigned int inlen)
-{
- struct blake2s_state *state = shash_desc_ctx(desc);
- const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
-
- if (unlikely(!inlen))
- return 0;
- if (inlen > fill) {
- memcpy(state->buf + state->buflen, in, fill);
- blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
- state->buflen = 0;
- in += fill;
- inlen -= fill;
+#define BLAKE2S_ALG(name, driver_name, digest_size) \
+ { \
+ .base.cra_name = name, \
+ .base.cra_driver_name = driver_name, \
+ .base.cra_priority = 200, \
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
+ .base.cra_module = THIS_MODULE, \
+ .digestsize = digest_size, \
+ .setkey = crypto_blake2s_setkey, \
+ .init = crypto_blake2s_init, \
+ .update = crypto_blake2s_update_x86, \
+ .final = crypto_blake2s_final_x86, \
+ .descsize = sizeof(struct blake2s_state), \
}
- if (inlen > BLAKE2S_BLOCK_SIZE) {
- const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
- /* Hash one less (full) block than strictly possible */
- blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
- in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
- inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
- }
- memcpy(state->buf + state->buflen, in, inlen);
- state->buflen += inlen;
-
- return 0;
-}
-
-static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
-{
- struct blake2s_state *state = shash_desc_ctx(desc);
-
- blake2s_set_lastblock(state);
- memset(state->buf + state->buflen, 0,
- BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
- blake2s_compress_arch(state, state->buf, 1, state->buflen);
- cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
- memcpy(out, state->h, state->outlen);
- memzero_explicit(state, sizeof(*state));
-
- return 0;
-}
-static struct shash_alg blake2s_algs[] = {{
- .base.cra_name = "blake2s-128",
- .base.cra_driver_name = "blake2s-128-x86",
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
- .base.cra_priority = 200,
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-
- .digestsize = BLAKE2S_128_HASH_SIZE,
- .setkey = crypto_blake2s_setkey,
- .init = crypto_blake2s_init,
- .update = crypto_blake2s_update,
- .final = crypto_blake2s_final,
- .descsize = sizeof(struct blake2s_state),
-}, {
- .base.cra_name = "blake2s-160",
- .base.cra_driver_name = "blake2s-160-x86",
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
- .base.cra_priority = 200,
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-
- .digestsize = BLAKE2S_160_HASH_SIZE,
- .setkey = crypto_blake2s_setkey,
- .init = crypto_blake2s_init,
- .update = crypto_blake2s_update,
- .final = crypto_blake2s_final,
- .descsize = sizeof(struct blake2s_state),
-}, {
- .base.cra_name = "blake2s-224",
- .base.cra_driver_name = "blake2s-224-x86",
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
- .base.cra_priority = 200,
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-
- .digestsize = BLAKE2S_224_HASH_SIZE,
- .setkey = crypto_blake2s_setkey,
- .init = crypto_blake2s_init,
- .update = crypto_blake2s_update,
- .final = crypto_blake2s_final,
- .descsize = sizeof(struct blake2s_state),
-}, {
- .base.cra_name = "blake2s-256",
- .base.cra_driver_name = "blake2s-256-x86",
- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
- .base.cra_priority = 200,
- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
- .base.cra_module = THIS_MODULE,
-
- .digestsize = BLAKE2S_256_HASH_SIZE,
- .setkey = crypto_blake2s_setkey,
- .init = crypto_blake2s_init,
- .update = crypto_blake2s_update,
- .final = crypto_blake2s_final,
- .descsize = sizeof(struct blake2s_state),
-}};
+static struct shash_alg blake2s_algs[] = {
+ BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
+ BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
+};
static int __init blake2s_mod_init(void)
{
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index cedfdba69ce3..a880e0b1c255 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -6,8 +6,6 @@
*
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
*/
#include <crypto/algapi.h>
@@ -247,97 +245,6 @@ static int cbc_decrypt(struct skcipher_request *req)
return err;
}
-static void ctr_crypt_final(struct bf_ctx *ctx, struct skcipher_walk *walk)
-{
- u8 *ctrblk = walk->iv;
- u8 keystream[BF_BLOCK_SIZE];
- u8 *src = walk->src.virt.addr;
- u8 *dst = walk->dst.virt.addr;
- unsigned int nbytes = walk->nbytes;
-
- blowfish_enc_blk(ctx, keystream, ctrblk);
- crypto_xor_cpy(dst, keystream, src, nbytes);
-
- crypto_inc(ctrblk, BF_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct bf_ctx *ctx, struct skcipher_walk *walk)
-{
- unsigned int bsize = BF_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u64 *src = (u64 *)walk->src.virt.addr;
- u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
- __be64 ctrblocks[4];
-
- /* Process four block batch */
- if (nbytes >= bsize * 4) {
- do {
- if (dst != src) {
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- }
-
- /* create ctrblks for parallel encrypt */
- ctrblocks[0] = cpu_to_be64(ctrblk++);
- ctrblocks[1] = cpu_to_be64(ctrblk++);
- ctrblocks[2] = cpu_to_be64(ctrblk++);
- ctrblocks[3] = cpu_to_be64(ctrblk++);
-
- blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
- (u8 *)ctrblocks);
-
- src += 4;
- dst += 4;
- } while ((nbytes -= bsize * 4) >= bsize * 4);
-
- if (nbytes < bsize)
- goto done;
- }
-
- /* Handle leftovers */
- do {
- if (dst != src)
- *dst = *src;
-
- ctrblocks[0] = cpu_to_be64(ctrblk++);
-
- blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks);
-
- src += 1;
- dst += 1;
- } while ((nbytes -= bsize) >= bsize);
-
-done:
- *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
- return nbytes;
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
- nbytes = __ctr_crypt(ctx, &walk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- if (nbytes) {
- ctr_crypt_final(ctx, &walk);
- err = skcipher_walk_done(&walk, 0);
- }
-
- return err;
-}
-
static struct crypto_alg bf_cipher_alg = {
.cra_name = "blowfish",
.cra_driver_name = "blowfish-asm",
@@ -384,20 +291,6 @@ static struct skcipher_alg bf_skcipher_algs[] = {
.setkey = blowfish_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "ctr(blowfish)",
- .base.cra_driver_name = "ctr-blowfish-asm",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct bf_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = BF_MIN_KEY_SIZE,
- .max_keysize = BF_MAX_KEY_SIZE,
- .ivsize = BF_BLOCK_SIZE,
- .chunksize = BF_BLOCK_SIZE,
- .setkey = blowfish_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
},
};
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index ecc0a9a905c4..e2a0e0f4bf9d 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -17,7 +17,6 @@
#include <linux/linkage.h>
#include <asm/frame.h>
-#include <asm/nospec-branch.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
@@ -589,14 +588,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
.long 0x80808080
.long 0x80808080
-/* For CTR-mode IV byteswap */
-.Lbswap128_mask:
- .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-/* For XTS mode IV generation */
-.Lxts_gf128mul_and_shl1_mask:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-
/*
* pre-SubByte transform
*
@@ -998,292 +989,3 @@ SYM_FUNC_START(camellia_cbc_dec_16way)
FRAME_END
ret;
SYM_FUNC_END(camellia_cbc_dec_16way)
-
-#define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
-
-SYM_FUNC_START(camellia_ctr_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
-
- subq $(16 * 16), %rsp;
- movq %rsp, %rax;
-
- vmovdqa .Lbswap128_mask, %xmm14;
-
- /* load IV and byteswap */
- vmovdqu (%rcx), %xmm0;
- vpshufb %xmm14, %xmm0, %xmm15;
- vmovdqu %xmm15, 15 * 16(%rax);
-
- vpcmpeqd %xmm15, %xmm15, %xmm15;
- vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
-
- /* construct IVs */
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm13;
- vmovdqu %xmm13, 14 * 16(%rax);
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm13;
- vmovdqu %xmm13, 13 * 16(%rax);
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm12;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm11;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm10;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm9;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm8;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm7;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm6;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm5;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm4;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm3;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm2;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vpshufb %xmm14, %xmm0, %xmm1;
- inc_le128(%xmm0, %xmm15, %xmm13);
- vmovdqa %xmm0, %xmm13;
- vpshufb %xmm14, %xmm0, %xmm0;
- inc_le128(%xmm13, %xmm15, %xmm14);
- vmovdqu %xmm13, (%rcx);
-
- /* inpack16_pre: */
- vmovq (key_table)(CTX), %xmm15;
- vpshufb .Lpack_bswap, %xmm15, %xmm15;
- vpxor %xmm0, %xmm15, %xmm0;
- vpxor %xmm1, %xmm15, %xmm1;
- vpxor %xmm2, %xmm15, %xmm2;
- vpxor %xmm3, %xmm15, %xmm3;
- vpxor %xmm4, %xmm15, %xmm4;
- vpxor %xmm5, %xmm15, %xmm5;
- vpxor %xmm6, %xmm15, %xmm6;
- vpxor %xmm7, %xmm15, %xmm7;
- vpxor %xmm8, %xmm15, %xmm8;
- vpxor %xmm9, %xmm15, %xmm9;
- vpxor %xmm10, %xmm15, %xmm10;
- vpxor %xmm11, %xmm15, %xmm11;
- vpxor %xmm12, %xmm15, %xmm12;
- vpxor 13 * 16(%rax), %xmm15, %xmm13;
- vpxor 14 * 16(%rax), %xmm15, %xmm14;
- vpxor 15 * 16(%rax), %xmm15, %xmm15;
-
- call __camellia_enc_blk16;
-
- addq $(16 * 16), %rsp;
-
- vpxor 0 * 16(%rdx), %xmm7, %xmm7;
- vpxor 1 * 16(%rdx), %xmm6, %xmm6;
- vpxor 2 * 16(%rdx), %xmm5, %xmm5;
- vpxor 3 * 16(%rdx), %xmm4, %xmm4;
- vpxor 4 * 16(%rdx), %xmm3, %xmm3;
- vpxor 5 * 16(%rdx), %xmm2, %xmm2;
- vpxor 6 * 16(%rdx), %xmm1, %xmm1;
- vpxor 7 * 16(%rdx), %xmm0, %xmm0;
- vpxor 8 * 16(%rdx), %xmm15, %xmm15;
- vpxor 9 * 16(%rdx), %xmm14, %xmm14;
- vpxor 10 * 16(%rdx), %xmm13, %xmm13;
- vpxor 11 * 16(%rdx), %xmm12, %xmm12;
- vpxor 12 * 16(%rdx), %xmm11, %xmm11;
- vpxor 13 * 16(%rdx), %xmm10, %xmm10;
- vpxor 14 * 16(%rdx), %xmm9, %xmm9;
- vpxor 15 * 16(%rdx), %xmm8, %xmm8;
- write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
- %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
- %xmm8, %rsi);
-
- FRAME_END
- ret;
-SYM_FUNC_END(camellia_ctr_16way)
-
-#define gf128mul_x_ble(iv, mask, tmp) \
- vpsrad $31, iv, tmp; \
- vpaddq iv, iv, iv; \
- vpshufd $0x13, tmp, tmp; \
- vpand mask, tmp, tmp; \
- vpxor tmp, iv, iv;
-
-.align 8
-SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- * %r8: index for input whitening key
- * %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
- */
- FRAME_BEGIN
-
- subq $(16 * 16), %rsp;
- movq %rsp, %rax;
-
- vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
-
- /* load IV */
- vmovdqu (%rcx), %xmm0;
- vpxor 0 * 16(%rdx), %xmm0, %xmm15;
- vmovdqu %xmm15, 15 * 16(%rax);
- vmovdqu %xmm0, 0 * 16(%rsi);
-
- /* construct IVs */
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 1 * 16(%rdx), %xmm0, %xmm15;
- vmovdqu %xmm15, 14 * 16(%rax);
- vmovdqu %xmm0, 1 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 2 * 16(%rdx), %xmm0, %xmm13;
- vmovdqu %xmm0, 2 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 3 * 16(%rdx), %xmm0, %xmm12;
- vmovdqu %xmm0, 3 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 4 * 16(%rdx), %xmm0, %xmm11;
- vmovdqu %xmm0, 4 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 5 * 16(%rdx), %xmm0, %xmm10;
- vmovdqu %xmm0, 5 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 6 * 16(%rdx), %xmm0, %xmm9;
- vmovdqu %xmm0, 6 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 7 * 16(%rdx), %xmm0, %xmm8;
- vmovdqu %xmm0, 7 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 8 * 16(%rdx), %xmm0, %xmm7;
- vmovdqu %xmm0, 8 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 9 * 16(%rdx), %xmm0, %xmm6;
- vmovdqu %xmm0, 9 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 10 * 16(%rdx), %xmm0, %xmm5;
- vmovdqu %xmm0, 10 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 11 * 16(%rdx), %xmm0, %xmm4;
- vmovdqu %xmm0, 11 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 12 * 16(%rdx), %xmm0, %xmm3;
- vmovdqu %xmm0, 12 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 13 * 16(%rdx), %xmm0, %xmm2;
- vmovdqu %xmm0, 13 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 14 * 16(%rdx), %xmm0, %xmm1;
- vmovdqu %xmm0, 14 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vpxor 15 * 16(%rdx), %xmm0, %xmm15;
- vmovdqu %xmm15, 0 * 16(%rax);
- vmovdqu %xmm0, 15 * 16(%rsi);
-
- gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
- vmovdqu %xmm0, (%rcx);
-
- /* inpack16_pre: */
- vmovq (key_table)(CTX, %r8, 8), %xmm15;
- vpshufb .Lpack_bswap, %xmm15, %xmm15;
- vpxor 0 * 16(%rax), %xmm15, %xmm0;
- vpxor %xmm1, %xmm15, %xmm1;
- vpxor %xmm2, %xmm15, %xmm2;
- vpxor %xmm3, %xmm15, %xmm3;
- vpxor %xmm4, %xmm15, %xmm4;
- vpxor %xmm5, %xmm15, %xmm5;
- vpxor %xmm6, %xmm15, %xmm6;
- vpxor %xmm7, %xmm15, %xmm7;
- vpxor %xmm8, %xmm15, %xmm8;
- vpxor %xmm9, %xmm15, %xmm9;
- vpxor %xmm10, %xmm15, %xmm10;
- vpxor %xmm11, %xmm15, %xmm11;
- vpxor %xmm12, %xmm15, %xmm12;
- vpxor %xmm13, %xmm15, %xmm13;
- vpxor 14 * 16(%rax), %xmm15, %xmm14;
- vpxor 15 * 16(%rax), %xmm15, %xmm15;
-
- CALL_NOSPEC r9;
-
- addq $(16 * 16), %rsp;
-
- vpxor 0 * 16(%rsi), %xmm7, %xmm7;
- vpxor 1 * 16(%rsi), %xmm6, %xmm6;
- vpxor 2 * 16(%rsi), %xmm5, %xmm5;
- vpxor 3 * 16(%rsi), %xmm4, %xmm4;
- vpxor 4 * 16(%rsi), %xmm3, %xmm3;
- vpxor 5 * 16(%rsi), %xmm2, %xmm2;
- vpxor 6 * 16(%rsi), %xmm1, %xmm1;
- vpxor 7 * 16(%rsi), %xmm0, %xmm0;
- vpxor 8 * 16(%rsi), %xmm15, %xmm15;
- vpxor 9 * 16(%rsi), %xmm14, %xmm14;
- vpxor 10 * 16(%rsi), %xmm13, %xmm13;
- vpxor 11 * 16(%rsi), %xmm12, %xmm12;
- vpxor 12 * 16(%rsi), %xmm11, %xmm11;
- vpxor 13 * 16(%rsi), %xmm10, %xmm10;
- vpxor 14 * 16(%rsi), %xmm9, %xmm9;
- vpxor 15 * 16(%rsi), %xmm8, %xmm8;
- write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
- %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
- %xmm8, %rsi);
-
- FRAME_END
- ret;
-SYM_FUNC_END(camellia_xts_crypt_16way)
-
-SYM_FUNC_START(camellia_xts_enc_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- xorl %r8d, %r8d; /* input whitening key, 0 for enc */
-
- leaq __camellia_enc_blk16, %r9;
-
- jmp camellia_xts_crypt_16way;
-SYM_FUNC_END(camellia_xts_enc_16way)
-
-SYM_FUNC_START(camellia_xts_dec_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
-
- cmpl $16, key_length(CTX);
- movl $32, %r8d;
- movl $24, %eax;
- cmovel %eax, %r8d; /* input whitening key, last for dec */
-
- leaq __camellia_dec_blk16, %r9;
-
- jmp camellia_xts_crypt_16way;
-SYM_FUNC_END(camellia_xts_dec_16way)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 0907243c501c..782e9712a1ec 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <asm/frame.h>
-#include <asm/nospec-branch.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
@@ -625,16 +624,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
.section .rodata.cst16, "aM", @progbits, 16
.align 16
-/* For CTR-mode IV byteswap */
-.Lbswap128_mask:
- .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-
-/* For XTS mode */
-.Lxts_gf128mul_and_shl1_mask_0:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-.Lxts_gf128mul_and_shl1_mask_1:
- .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
-
/*
* pre-SubByte transform
*
@@ -1061,343 +1050,3 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
FRAME_END
ret;
SYM_FUNC_END(camellia_cbc_dec_32way)
-
-#define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
-
-#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
- vpcmpeqq minus_one, x, tmp1; \
- vpcmpeqq minus_two, x, tmp2; \
- vpsubq minus_two, x, x; \
- vpor tmp2, tmp1, tmp1; \
- vpslldq $8, tmp1, tmp1; \
- vpsubq tmp1, x, x;
-
-SYM_FUNC_START(camellia_ctr_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- movq %rsp, %r10;
- cmpq %rsi, %rdx;
- je .Lctr_use_stack;
-
- /* dst can be used as temporary storage, src is not overwritten. */
- movq %rsi, %rax;
- jmp .Lctr_continue;
-
-.Lctr_use_stack:
- subq $(16 * 32), %rsp;
- movq %rsp, %rax;
-
-.Lctr_continue:
- vpcmpeqd %ymm15, %ymm15, %ymm15;
- vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
- vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
-
- /* load IV and byteswap */
- vmovdqu (%rcx), %xmm0;
- vmovdqa %xmm0, %xmm1;
- inc_le128(%xmm0, %xmm15, %xmm14);
- vbroadcasti128 .Lbswap128_mask, %ymm14;
- vinserti128 $1, %xmm0, %ymm1, %ymm0;
- vpshufb %ymm14, %ymm0, %ymm13;
- vmovdqu %ymm13, 15 * 32(%rax);
-
- /* construct IVs */
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
- vpshufb %ymm14, %ymm0, %ymm13;
- vmovdqu %ymm13, 14 * 32(%rax);
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm13;
- vmovdqu %ymm13, 13 * 32(%rax);
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm13;
- vmovdqu %ymm13, 12 * 32(%rax);
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm13;
- vmovdqu %ymm13, 11 * 32(%rax);
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm10;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm9;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm8;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm7;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm6;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm5;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm4;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm3;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm2;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vpshufb %ymm14, %ymm0, %ymm1;
- add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
- vextracti128 $1, %ymm0, %xmm13;
- vpshufb %ymm14, %ymm0, %ymm0;
- inc_le128(%xmm13, %xmm15, %xmm14);
- vmovdqu %xmm13, (%rcx);
-
- /* inpack32_pre: */
- vpbroadcastq (key_table)(CTX), %ymm15;
- vpshufb .Lpack_bswap, %ymm15, %ymm15;
- vpxor %ymm0, %ymm15, %ymm0;
- vpxor %ymm1, %ymm15, %ymm1;
- vpxor %ymm2, %ymm15, %ymm2;
- vpxor %ymm3, %ymm15, %ymm3;
- vpxor %ymm4, %ymm15, %ymm4;
- vpxor %ymm5, %ymm15, %ymm5;
- vpxor %ymm6, %ymm15, %ymm6;
- vpxor %ymm7, %ymm15, %ymm7;
- vpxor %ymm8, %ymm15, %ymm8;
- vpxor %ymm9, %ymm15, %ymm9;
- vpxor %ymm10, %ymm15, %ymm10;
- vpxor 11 * 32(%rax), %ymm15, %ymm11;
- vpxor 12 * 32(%rax), %ymm15, %ymm12;
- vpxor 13 * 32(%rax), %ymm15, %ymm13;
- vpxor 14 * 32(%rax), %ymm15, %ymm14;
- vpxor 15 * 32(%rax), %ymm15, %ymm15;
-
- call __camellia_enc_blk32;
-
- movq %r10, %rsp;
-
- vpxor 0 * 32(%rdx), %ymm7, %ymm7;
- vpxor 1 * 32(%rdx), %ymm6, %ymm6;
- vpxor 2 * 32(%rdx), %ymm5, %ymm5;
- vpxor 3 * 32(%rdx), %ymm4, %ymm4;
- vpxor 4 * 32(%rdx), %ymm3, %ymm3;
- vpxor 5 * 32(%rdx), %ymm2, %ymm2;
- vpxor 6 * 32(%rdx), %ymm1, %ymm1;
- vpxor 7 * 32(%rdx), %ymm0, %ymm0;
- vpxor 8 * 32(%rdx), %ymm15, %ymm15;
- vpxor 9 * 32(%rdx), %ymm14, %ymm14;
- vpxor 10 * 32(%rdx), %ymm13, %ymm13;
- vpxor 11 * 32(%rdx), %ymm12, %ymm12;
- vpxor 12 * 32(%rdx), %ymm11, %ymm11;
- vpxor 13 * 32(%rdx), %ymm10, %ymm10;
- vpxor 14 * 32(%rdx), %ymm9, %ymm9;
- vpxor 15 * 32(%rdx), %ymm8, %ymm8;
- write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
- %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
- %ymm8, %rsi);
-
- vzeroupper;
-
- FRAME_END
- ret;
-SYM_FUNC_END(camellia_ctr_32way)
-
-#define gf128mul_x_ble(iv, mask, tmp) \
- vpsrad $31, iv, tmp; \
- vpaddq iv, iv, iv; \
- vpshufd $0x13, tmp, tmp; \
- vpand mask, tmp, tmp; \
- vpxor tmp, iv, iv;
-
-#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
- vpsrad $31, iv, tmp0; \
- vpaddq iv, iv, tmp1; \
- vpsllq $2, iv, iv; \
- vpshufd $0x13, tmp0, tmp0; \
- vpsrad $31, tmp1, tmp1; \
- vpand mask2, tmp0, tmp0; \
- vpshufd $0x13, tmp1, tmp1; \
- vpxor tmp0, iv, iv; \
- vpand mask1, tmp1, tmp1; \
- vpxor tmp1, iv, iv;
-
-.align 8
-SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- * %r8: index for input whitening key
- * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- subq $(16 * 32), %rsp;
- movq %rsp, %rax;
-
- vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
-
- /* load IV and construct second IV */
- vmovdqu (%rcx), %xmm0;
- vmovdqa %xmm0, %xmm15;
- gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
- vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
- vinserti128 $1, %xmm0, %ymm15, %ymm0;
- vpxor 0 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 15 * 32(%rax);
- vmovdqu %ymm0, 0 * 32(%rsi);
-
- /* construct IVs */
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 1 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 14 * 32(%rax);
- vmovdqu %ymm0, 1 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 2 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 13 * 32(%rax);
- vmovdqu %ymm0, 2 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 3 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 12 * 32(%rax);
- vmovdqu %ymm0, 3 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 4 * 32(%rdx), %ymm0, %ymm11;
- vmovdqu %ymm0, 4 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 5 * 32(%rdx), %ymm0, %ymm10;
- vmovdqu %ymm0, 5 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 6 * 32(%rdx), %ymm0, %ymm9;
- vmovdqu %ymm0, 6 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 7 * 32(%rdx), %ymm0, %ymm8;
- vmovdqu %ymm0, 7 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 8 * 32(%rdx), %ymm0, %ymm7;
- vmovdqu %ymm0, 8 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 9 * 32(%rdx), %ymm0, %ymm6;
- vmovdqu %ymm0, 9 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 10 * 32(%rdx), %ymm0, %ymm5;
- vmovdqu %ymm0, 10 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 11 * 32(%rdx), %ymm0, %ymm4;
- vmovdqu %ymm0, 11 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 12 * 32(%rdx), %ymm0, %ymm3;
- vmovdqu %ymm0, 12 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 13 * 32(%rdx), %ymm0, %ymm2;
- vmovdqu %ymm0, 13 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 14 * 32(%rdx), %ymm0, %ymm1;
- vmovdqu %ymm0, 14 * 32(%rsi);
-
- gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
- vpxor 15 * 32(%rdx), %ymm0, %ymm15;
- vmovdqu %ymm15, 0 * 32(%rax);
- vmovdqu %ymm0, 15 * 32(%rsi);
-
- vextracti128 $1, %ymm0, %xmm0;
- gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
- vmovdqu %xmm0, (%rcx);
-
- /* inpack32_pre: */
- vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
- vpshufb .Lpack_bswap, %ymm15, %ymm15;
- vpxor 0 * 32(%rax), %ymm15, %ymm0;
- vpxor %ymm1, %ymm15, %ymm1;
- vpxor %ymm2, %ymm15, %ymm2;
- vpxor %ymm3, %ymm15, %ymm3;
- vpxor %ymm4, %ymm15, %ymm4;
- vpxor %ymm5, %ymm15, %ymm5;
- vpxor %ymm6, %ymm15, %ymm6;
- vpxor %ymm7, %ymm15, %ymm7;
- vpxor %ymm8, %ymm15, %ymm8;
- vpxor %ymm9, %ymm15, %ymm9;
- vpxor %ymm10, %ymm15, %ymm10;
- vpxor %ymm11, %ymm15, %ymm11;
- vpxor 12 * 32(%rax), %ymm15, %ymm12;
- vpxor 13 * 32(%rax), %ymm15, %ymm13;
- vpxor 14 * 32(%rax), %ymm15, %ymm14;
- vpxor 15 * 32(%rax), %ymm15, %ymm15;
-
- CALL_NOSPEC r9;
-
- addq $(16 * 32), %rsp;
-
- vpxor 0 * 32(%rsi), %ymm7, %ymm7;
- vpxor 1 * 32(%rsi), %ymm6, %ymm6;
- vpxor 2 * 32(%rsi), %ymm5, %ymm5;
- vpxor 3 * 32(%rsi), %ymm4, %ymm4;
- vpxor 4 * 32(%rsi), %ymm3, %ymm3;
- vpxor 5 * 32(%rsi), %ymm2, %ymm2;
- vpxor 6 * 32(%rsi), %ymm1, %ymm1;
- vpxor 7 * 32(%rsi), %ymm0, %ymm0;
- vpxor 8 * 32(%rsi), %ymm15, %ymm15;
- vpxor 9 * 32(%rsi), %ymm14, %ymm14;
- vpxor 10 * 32(%rsi), %ymm13, %ymm13;
- vpxor 11 * 32(%rsi), %ymm12, %ymm12;
- vpxor 12 * 32(%rsi), %ymm11, %ymm11;
- vpxor 13 * 32(%rsi), %ymm10, %ymm10;
- vpxor 14 * 32(%rsi), %ymm9, %ymm9;
- vpxor 15 * 32(%rsi), %ymm8, %ymm8;
- write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
- %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
- %ymm8, %rsi);
-
- vzeroupper;
-
- FRAME_END
- ret;
-SYM_FUNC_END(camellia_xts_crypt_32way)
-
-SYM_FUNC_START(camellia_xts_enc_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
-
- xorl %r8d, %r8d; /* input whitening key, 0 for enc */
-
- leaq __camellia_enc_blk32, %r9;
-
- jmp camellia_xts_crypt_32way;
-SYM_FUNC_END(camellia_xts_enc_32way)
-
-SYM_FUNC_START(camellia_xts_dec_32way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (32 blocks)
- * %rdx: src (32 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
-
- cmpl $16, key_length(CTX);
- movl $32, %r8d;
- movl $24, %eax;
- cmovel %eax, %r8d; /* input whitening key, last for dec */
-
- leaq __camellia_dec_blk32, %r9;
-
- jmp camellia_xts_crypt_32way;
-SYM_FUNC_END(camellia_xts_dec_32way)
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/crypto/camellia.h
index f6d91861cb14..1dcea79e8f8e 100644
--- a/arch/x86/include/asm/crypto/camellia.h
+++ b/arch/x86/crypto/camellia.h
@@ -19,18 +19,10 @@ struct camellia_ctx {
u32 key_length;
};
-struct camellia_xts_ctx {
- struct camellia_ctx tweak_ctx;
- struct camellia_ctx crypt_ctx;
-};
-
extern int __camellia_setkey(struct camellia_ctx *cctx,
const unsigned char *key,
unsigned int key_len);
-extern int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen);
-
/* regular block cipher functions */
asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src,
bool xor);
@@ -46,13 +38,6 @@ asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src)
{
@@ -78,14 +63,5 @@ static inline void camellia_enc_blk_xor_2way(const void *ctx, u8 *dst,
/* glue helpers */
extern void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src);
-extern void camellia_crypt_ctr(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-extern void camellia_crypt_ctr_2way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-extern void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-extern void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
#endif /* ASM_X86_CAMELLIA_H */
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index ccda647422d6..e7e4d64e9577 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -5,16 +5,16 @@
* Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*/
-#include <asm/crypto/camellia.h>
-#include <asm/crypto/glue_helper.h>
#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
-#include <crypto/xts.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/types.h>
+#include "camellia.h"
+#include "ecb_cbc_helpers.h"
+
#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
@@ -23,121 +23,6 @@ asmlinkage void camellia_ecb_enc_32way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void camellia_ctr_32way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-asmlinkage void camellia_xts_enc_32way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void camellia_xts_dec_32way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-static const struct common_glue_ctx camellia_enc = {
- .num_funcs = 4,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_enc_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_enc_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_enc_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_enc_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_ctr = {
- .num_funcs = 4,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .ctr = camellia_ctr_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ctr = camellia_ctr_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ctr = camellia_crypt_ctr_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = camellia_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx camellia_enc_xts = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_enc_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_enc_16way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = camellia_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec = {
- .num_funcs = 4,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_dec_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_dec_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_dec_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec_cbc = {
- .num_funcs = 4,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .cbc = camellia_cbc_dec_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .cbc = camellia_cbc_dec_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .cbc = camellia_decrypt_cbc_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = camellia_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec_xts = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_dec_32way }
- }, {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_dec_16way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = camellia_xts_dec }
- } }
-};
static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
@@ -147,45 +32,39 @@ static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_enc, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_enc_32way);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way);
+ ECB_BLOCK(2, camellia_enc_blk_2way);
+ ECB_BLOCK(1, camellia_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_dec, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_ecb_dec_32way);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way);
+ ECB_BLOCK(2, camellia_dec_blk_2way);
+ ECB_BLOCK(1, camellia_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(camellia_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&camellia_ctr, req);
-}
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, camellia_cbc_dec_32way);
+ CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way);
+ CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way);
+ CBC_DEC_BLOCK(1, camellia_dec_blk);
+ CBC_WALK_END();
}
static struct skcipher_alg camellia_algs[] = {
@@ -216,35 +95,6 @@ static struct skcipher_alg camellia_algs[] = {
.setkey = camellia_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(camellia)",
- .base.cra_driver_name = "__ctr-camellia-aesni-avx2",
- .base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct camellia_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = CAMELLIA_MIN_KEY_SIZE,
- .max_keysize = CAMELLIA_MAX_KEY_SIZE,
- .ivsize = CAMELLIA_BLOCK_SIZE,
- .chunksize = CAMELLIA_BLOCK_SIZE,
- .setkey = camellia_setkey,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(camellia)",
- .base.cra_driver_name = "__xts-camellia-aesni-avx2",
- .base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct camellia_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE,
- .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE,
- .ivsize = CAMELLIA_BLOCK_SIZE,
- .setkey = xts_camellia_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
},
};
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 4e5de6ef206e..c7ccf63e741e 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -5,16 +5,16 @@
* Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*/
-#include <asm/crypto/camellia.h>
-#include <asm/crypto/glue_helper.h>
#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
-#include <crypto/xts.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/types.h>
+#include "camellia.h"
+#include "ecb_cbc_helpers.h"
+
#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
/* 16-way parallel cipher functions (avx/aes-ni) */
@@ -27,120 +27,6 @@ EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way);
asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way);
-asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-EXPORT_SYMBOL_GPL(camellia_ctr_16way);
-
-asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-EXPORT_SYMBOL_GPL(camellia_xts_enc_16way);
-
-asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-EXPORT_SYMBOL_GPL(camellia_xts_dec_16way);
-
-void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_enc_blk);
-}
-EXPORT_SYMBOL_GPL(camellia_xts_enc);
-
-void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_dec_blk);
-}
-EXPORT_SYMBOL_GPL(camellia_xts_dec);
-
-static const struct common_glue_ctx camellia_enc = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_enc_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_enc_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_enc_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_ctr = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ctr = camellia_ctr_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ctr = camellia_crypt_ctr_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = camellia_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx camellia_enc_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_enc_16way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = camellia_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = camellia_ecb_dec_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_dec_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec_cbc = {
- .num_funcs = 3,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .cbc = camellia_cbc_dec_16way }
- }, {
- .num_blocks = 2,
- .fn_u = { .cbc = camellia_decrypt_cbc_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = camellia_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = camellia_xts_dec_16way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = camellia_xts_dec }
- } }
-};
-
static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
@@ -149,65 +35,36 @@ static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_enc, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way);
+ ECB_BLOCK(2, camellia_enc_blk_2way);
+ ECB_BLOCK(1, camellia_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_dec, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way);
+ ECB_BLOCK(2, camellia_dec_blk_2way);
+ ECB_BLOCK(1, camellia_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(camellia_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&camellia_ctr, req);
-}
-
-int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- /* first half of xts-key is for crypt */
- err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2);
- if (err)
- return err;
-
- /* second half of xts-key is for tweak */
- return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
-}
-EXPORT_SYMBOL_GPL(xts_camellia_setkey);
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way);
+ CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way);
+ CBC_DEC_BLOCK(1, camellia_dec_blk);
+ CBC_WALK_END();
}
static struct skcipher_alg camellia_algs[] = {
@@ -238,36 +95,7 @@ static struct skcipher_alg camellia_algs[] = {
.setkey = camellia_setkey,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(camellia)",
- .base.cra_driver_name = "__ctr-camellia-aesni",
- .base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct camellia_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = CAMELLIA_MIN_KEY_SIZE,
- .max_keysize = CAMELLIA_MAX_KEY_SIZE,
- .ivsize = CAMELLIA_BLOCK_SIZE,
- .chunksize = CAMELLIA_BLOCK_SIZE,
- .setkey = camellia_setkey,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(camellia)",
- .base.cra_driver_name = "__xts-camellia-aesni",
- .base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct camellia_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * CAMELLIA_MIN_KEY_SIZE,
- .max_keysize = 2 * CAMELLIA_MAX_KEY_SIZE,
- .ivsize = CAMELLIA_BLOCK_SIZE,
- .setkey = xts_camellia_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
- },
+ }
};
static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 242c056e5fa8..66c435ba9d3d 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -14,8 +14,9 @@
#include <linux/module.h>
#include <linux/types.h>
#include <crypto/algapi.h>
-#include <asm/crypto/camellia.h>
-#include <asm/crypto/glue_helper.h>
+
+#include "camellia.h"
+#include "ecb_cbc_helpers.h"
/* regular block cipher functions */
asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src,
@@ -1262,129 +1263,47 @@ static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
return camellia_setkey(&tfm->base, key, key_len);
}
-void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s)
+void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src)
{
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
- u128 iv = *src;
-
- camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
+ u8 buf[CAMELLIA_BLOCK_SIZE];
+ const u8 *iv = src;
- u128_xor(&dst[1], &dst[1], &iv);
+ if (dst == src)
+ iv = memcpy(buf, iv, sizeof(buf));
+ camellia_dec_blk_2way(ctx, dst, src);
+ crypto_xor(dst + CAMELLIA_BLOCK_SIZE, iv, CAMELLIA_BLOCK_SIZE);
}
EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
-void camellia_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblk;
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- if (dst != src)
- *dst = *src;
-
- le128_to_be128(&ctrblk, iv);
- le128_inc(iv);
-
- camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
-}
-EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
-
-void camellia_crypt_ctr_2way(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblks[2];
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- if (dst != src) {
- dst[0] = src[0];
- dst[1] = src[1];
- }
-
- le128_to_be128(&ctrblks[0], iv);
- le128_inc(iv);
- le128_to_be128(&ctrblks[1], iv);
- le128_inc(iv);
-
- camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
-
-static const struct common_glue_ctx camellia_enc = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_enc_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_enc_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_ctr = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 2,
- .fn_u = { .ctr = camellia_crypt_ctr_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = camellia_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 2,
- .fn_u = { .ecb = camellia_dec_blk_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = camellia_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx camellia_dec_cbc = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 2,
- .fn_u = { .cbc = camellia_decrypt_cbc_2way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = camellia_dec_blk }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_enc, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ ECB_BLOCK(2, camellia_enc_blk_2way);
+ ECB_BLOCK(1, camellia_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&camellia_dec, req);
+ ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ ECB_BLOCK(2, camellia_dec_blk_2way);
+ ECB_BLOCK(1, camellia_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(camellia_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&camellia_ctr, req);
+ CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1);
+ CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way);
+ CBC_DEC_BLOCK(1, camellia_dec_blk);
+ CBC_WALK_END();
}
static struct crypto_alg camellia_cipher_alg = {
@@ -1433,20 +1352,6 @@ static struct skcipher_alg camellia_skcipher_algs[] = {
.setkey = camellia_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "ctr(camellia)",
- .base.cra_driver_name = "ctr-camellia-asm",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct camellia_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = CAMELLIA_MIN_KEY_SIZE,
- .max_keysize = CAMELLIA_MAX_KEY_SIZE,
- .ivsize = CAMELLIA_BLOCK_SIZE,
- .chunksize = CAMELLIA_BLOCK_SIZE,
- .setkey = camellia_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
}
};
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 384ccb00f9e1..3976a87f92ad 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -6,7 +6,6 @@
* <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
*/
-#include <asm/crypto/glue_helper.h>
#include <crypto/algapi.h>
#include <crypto/cast5.h>
#include <crypto/internal/simd.h>
@@ -15,6 +14,8 @@
#include <linux/module.h>
#include <linux/types.h>
+#include "ecb_cbc_helpers.h"
+
#define CAST5_PARALLEL_BLOCKS 16
asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
@@ -23,8 +24,6 @@ asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src);
asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
const u8 *src);
-asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
- __be64 *iv);
static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
@@ -32,272 +31,35 @@ static int cast5_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
return cast5_setkey(&tfm->base, key, keylen);
}
-static inline bool cast5_fpu_begin(bool fpu_enabled, struct skcipher_walk *walk,
- unsigned int nbytes)
-{
- return glue_fpu_begin(CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS,
- walk, fpu_enabled, nbytes);
-}
-
-static inline void cast5_fpu_end(bool fpu_enabled)
-{
- return glue_fpu_end(fpu_enabled);
-}
-
-static int ecb_crypt(struct skcipher_request *req, bool enc)
-{
- bool fpu_enabled = false;
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- const unsigned int bsize = CAST5_BLOCK_SIZE;
- unsigned int nbytes;
- void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- u8 *wsrc = walk.src.virt.addr;
- u8 *wdst = walk.dst.virt.addr;
-
- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
-
- /* Process multi-block batch */
- if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
- fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
- do {
- fn(ctx, wdst, wsrc);
-
- wsrc += bsize * CAST5_PARALLEL_BLOCKS;
- wdst += bsize * CAST5_PARALLEL_BLOCKS;
- nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
- } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
-
- if (nbytes < bsize)
- goto done;
- }
-
- fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
-
- /* Handle leftovers */
- do {
- fn(ctx, wdst, wsrc);
-
- wsrc += bsize;
- wdst += bsize;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
-done:
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- cast5_fpu_end(fpu_enabled);
- return err;
-}
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return ecb_crypt(req, true);
+ ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_enc_16way);
+ ECB_BLOCK(1, __cast5_encrypt);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return ecb_crypt(req, false);
+ ECB_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_ecb_dec_16way);
+ ECB_BLOCK(1, __cast5_decrypt);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- const unsigned int bsize = CAST5_BLOCK_SIZE;
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- u64 *src = (u64 *)walk.src.virt.addr;
- u64 *dst = (u64 *)walk.dst.virt.addr;
- u64 *iv = (u64 *)walk.iv;
-
- do {
- *dst = *src ^ *iv;
- __cast5_encrypt(ctx, (u8 *)dst, (u8 *)dst);
- iv = dst;
- src++;
- dst++;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
- *(u64 *)walk.iv = *iv;
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
-static unsigned int __cbc_decrypt(struct cast5_ctx *ctx,
- struct skcipher_walk *walk)
-{
- const unsigned int bsize = CAST5_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u64 *src = (u64 *)walk->src.virt.addr;
- u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 last_iv;
-
- /* Start of the last block. */
- src += nbytes / bsize - 1;
- dst += nbytes / bsize - 1;
-
- last_iv = *src;
-
- /* Process multi-block batch */
- if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
- do {
- nbytes -= bsize * (CAST5_PARALLEL_BLOCKS - 1);
- src -= CAST5_PARALLEL_BLOCKS - 1;
- dst -= CAST5_PARALLEL_BLOCKS - 1;
-
- cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
-
- nbytes -= bsize;
- if (nbytes < bsize)
- goto done;
-
- *dst ^= *(src - 1);
- src -= 1;
- dst -= 1;
- } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
- }
-
- /* Handle leftovers */
- for (;;) {
- __cast5_decrypt(ctx, (u8 *)dst, (u8 *)src);
-
- nbytes -= bsize;
- if (nbytes < bsize)
- break;
-
- *dst ^= *(src - 1);
- src -= 1;
- dst -= 1;
- }
-
-done:
- *dst ^= *(u64 *)walk->iv;
- *(u64 *)walk->iv = last_iv;
-
- return nbytes;
+ CBC_WALK_START(req, CAST5_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__cast5_encrypt);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
- bool fpu_enabled = false;
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
- nbytes = __cbc_decrypt(ctx, &walk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- cast5_fpu_end(fpu_enabled);
- return err;
-}
-
-static void ctr_crypt_final(struct skcipher_walk *walk, struct cast5_ctx *ctx)
-{
- u8 *ctrblk = walk->iv;
- u8 keystream[CAST5_BLOCK_SIZE];
- u8 *src = walk->src.virt.addr;
- u8 *dst = walk->dst.virt.addr;
- unsigned int nbytes = walk->nbytes;
-
- __cast5_encrypt(ctx, keystream, ctrblk);
- crypto_xor_cpy(dst, keystream, src, nbytes);
-
- crypto_inc(ctrblk, CAST5_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct skcipher_walk *walk,
- struct cast5_ctx *ctx)
-{
- const unsigned int bsize = CAST5_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u64 *src = (u64 *)walk->src.virt.addr;
- u64 *dst = (u64 *)walk->dst.virt.addr;
-
- /* Process multi-block batch */
- if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
- do {
- cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
- (__be64 *)walk->iv);
-
- src += CAST5_PARALLEL_BLOCKS;
- dst += CAST5_PARALLEL_BLOCKS;
- nbytes -= bsize * CAST5_PARALLEL_BLOCKS;
- } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS);
-
- if (nbytes < bsize)
- goto done;
- }
-
- /* Handle leftovers */
- do {
- u64 ctrblk;
-
- if (dst != src)
- *dst = *src;
-
- ctrblk = *(u64 *)walk->iv;
- be64_add_cpu((__be64 *)walk->iv, 1);
-
- __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
- *dst ^= ctrblk;
-
- src += 1;
- dst += 1;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
-done:
- return nbytes;
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm);
- bool fpu_enabled = false;
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) {
- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes);
- nbytes = __ctr_crypt(&walk, ctx);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- cast5_fpu_end(fpu_enabled);
-
- if (walk.nbytes) {
- ctr_crypt_final(&walk, ctx);
- err = skcipher_walk_done(&walk, 0);
- }
-
- return err;
+ CBC_WALK_START(req, CAST5_BLOCK_SIZE, CAST5_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAST5_PARALLEL_BLOCKS, cast5_cbc_dec_16way);
+ CBC_DEC_BLOCK(1, __cast5_decrypt);
+ CBC_WALK_END();
}
static struct skcipher_alg cast5_algs[] = {
@@ -328,21 +90,6 @@ static struct skcipher_alg cast5_algs[] = {
.setkey = cast5_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(cast5)",
- .base.cra_driver_name = "__ctr-cast5-avx",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct cast5_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = CAST5_MIN_KEY_SIZE,
- .max_keysize = CAST5_MAX_KEY_SIZE,
- .ivsize = CAST5_BLOCK_SIZE,
- .chunksize = CAST5_BLOCK_SIZE,
- .setkey = cast5_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
}
};
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 932a3ce32a88..fbddcecc3e3f 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -212,8 +212,6 @@
.section .rodata.cst16, "aM", @progbits, 16
.align 16
-.Lxts_gf128mul_and_shl1_mask:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lbswap_mask:
.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
@@ -412,85 +410,3 @@ SYM_FUNC_START(cast6_cbc_dec_8way)
FRAME_END
ret;
SYM_FUNC_END(cast6_cbc_dec_8way)
-
-SYM_FUNC_START(cast6_ctr_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
- pushq %r12;
- pushq %r15
-
- movq %rdi, CTX;
- movq %rsi, %r11;
- movq %rdx, %r12;
-
- load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RX, RKR, RKM);
-
- call __cast6_enc_blk8;
-
- store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- popq %r15;
- popq %r12;
- FRAME_END
- ret;
-SYM_FUNC_END(cast6_ctr_8way)
-
-SYM_FUNC_START(cast6_xts_enc_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
- pushq %r15;
-
- movq %rdi, CTX
- movq %rsi, %r11;
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
- RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
-
- call __cast6_enc_blk8;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- popq %r15;
- FRAME_END
- ret;
-SYM_FUNC_END(cast6_xts_enc_8way)
-
-SYM_FUNC_START(cast6_xts_dec_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
- pushq %r15;
-
- movq %rdi, CTX
- movq %rsi, %r11;
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
- RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
-
- call __cast6_dec_blk8;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- popq %r15;
- FRAME_END
- ret;
-SYM_FUNC_END(cast6_xts_dec_8way)
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 48e0f37796fa..7e2aea372349 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -15,8 +15,8 @@
#include <crypto/algapi.h>
#include <crypto/cast6.h>
#include <crypto/internal/simd.h>
-#include <crypto/xts.h>
-#include <asm/crypto/glue_helper.h>
+
+#include "ecb_cbc_helpers.h"
#define CAST6_PARALLEL_BLOCKS 8
@@ -24,13 +24,6 @@ asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void cast6_ctr_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-asmlinkage void cast6_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void cast6_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
static int cast6_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -38,172 +31,35 @@ static int cast6_setkey_skcipher(struct crypto_skcipher *tfm,
return cast6_setkey(&tfm->base, key, keylen);
}
-static void cast6_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_encrypt);
-}
-
-static void cast6_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_decrypt);
-}
-
-static void cast6_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblk;
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- le128_to_be128(&ctrblk, iv);
- le128_inc(iv);
-
- __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
- u128_xor(dst, src, (u128 *)&ctrblk);
-}
-
-static const struct common_glue_ctx cast6_enc = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ecb = cast6_ecb_enc_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __cast6_encrypt }
- } }
-};
-
-static const struct common_glue_ctx cast6_ctr = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ctr = cast6_ctr_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = cast6_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx cast6_enc_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .xts = cast6_xts_enc_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = cast6_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx cast6_dec = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ecb = cast6_ecb_dec_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __cast6_decrypt }
- } }
-};
-
-static const struct common_glue_ctx cast6_dec_cbc = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .cbc = cast6_cbc_dec_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = __cast6_decrypt }
- } }
-};
-
-static const struct common_glue_ctx cast6_dec_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .xts = cast6_xts_dec_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = cast6_xts_dec }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&cast6_enc, req);
+ ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_enc_8way);
+ ECB_BLOCK(1, __cast6_encrypt);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&cast6_dec, req);
+ ECB_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS);
+ ECB_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_ecb_dec_8way);
+ ECB_BLOCK(1, __cast6_decrypt);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(__cast6_encrypt, req);
+ CBC_WALK_START(req, CAST6_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__cast6_encrypt);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&cast6_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&cast6_ctr, req);
-}
-
-struct cast6_xts_ctx {
- struct cast6_ctx tweak_ctx;
- struct cast6_ctx crypt_ctx;
-};
-
-static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- /* first half of xts-key is for crypt */
- err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2);
- if (err)
- return err;
-
- /* second half of xts-key is for tweak */
- return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
-}
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&cast6_enc_xts, req, __cast6_encrypt,
- &ctx->tweak_ctx, &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&cast6_dec_xts, req, __cast6_encrypt,
- &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, CAST6_BLOCK_SIZE, CAST6_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(CAST6_PARALLEL_BLOCKS, cast6_cbc_dec_8way);
+ CBC_DEC_BLOCK(1, __cast6_decrypt);
+ CBC_WALK_END();
}
static struct skcipher_alg cast6_algs[] = {
@@ -234,35 +90,6 @@ static struct skcipher_alg cast6_algs[] = {
.setkey = cast6_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(cast6)",
- .base.cra_driver_name = "__ctr-cast6-avx",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct cast6_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = CAST6_MIN_KEY_SIZE,
- .max_keysize = CAST6_MAX_KEY_SIZE,
- .ivsize = CAST6_BLOCK_SIZE,
- .chunksize = CAST6_BLOCK_SIZE,
- .setkey = cast6_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(cast6)",
- .base.cra_driver_name = "__xts-cast6-avx",
- .base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = CAST6_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct cast6_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * CAST6_MIN_KEY_SIZE,
- .max_keysize = 2 * CAST6_MAX_KEY_SIZE,
- .ivsize = CAST6_BLOCK_SIZE,
- .setkey = xts_cast6_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
},
};
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index 89830e531350..e7cb68a3db3b 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -6,8 +6,6 @@
*
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
*/
#include <crypto/algapi.h>
@@ -253,94 +251,6 @@ static int cbc_decrypt(struct skcipher_request *req)
return err;
}
-static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
- struct skcipher_walk *walk)
-{
- u8 *ctrblk = walk->iv;
- u8 keystream[DES3_EDE_BLOCK_SIZE];
- u8 *src = walk->src.virt.addr;
- u8 *dst = walk->dst.virt.addr;
- unsigned int nbytes = walk->nbytes;
-
- des3_ede_enc_blk(ctx, keystream, ctrblk);
- crypto_xor_cpy(dst, keystream, src, nbytes);
-
- crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE);
-}
-
-static unsigned int __ctr_crypt(struct des3_ede_x86_ctx *ctx,
- struct skcipher_walk *walk)
-{
- unsigned int bsize = DES3_EDE_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- __be64 *src = (__be64 *)walk->src.virt.addr;
- __be64 *dst = (__be64 *)walk->dst.virt.addr;
- u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
- __be64 ctrblocks[3];
-
- /* Process four block batch */
- if (nbytes >= bsize * 3) {
- do {
- /* create ctrblks for parallel encrypt */
- ctrblocks[0] = cpu_to_be64(ctrblk++);
- ctrblocks[1] = cpu_to_be64(ctrblk++);
- ctrblocks[2] = cpu_to_be64(ctrblk++);
-
- des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks,
- (u8 *)ctrblocks);
-
- dst[0] = src[0] ^ ctrblocks[0];
- dst[1] = src[1] ^ ctrblocks[1];
- dst[2] = src[2] ^ ctrblocks[2];
-
- src += 3;
- dst += 3;
- } while ((nbytes -= bsize * 3) >= bsize * 3);
-
- if (nbytes < bsize)
- goto done;
- }
-
- /* Handle leftovers */
- do {
- ctrblocks[0] = cpu_to_be64(ctrblk++);
-
- des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
-
- dst[0] = src[0] ^ ctrblocks[0];
-
- src += 1;
- dst += 1;
- } while ((nbytes -= bsize) >= bsize);
-
-done:
- *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
- return nbytes;
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) {
- nbytes = __ctr_crypt(ctx, &walk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- if (nbytes) {
- ctr_crypt_final(ctx, &walk);
- err = skcipher_walk_done(&walk, 0);
- }
-
- return err;
-}
-
static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
@@ -428,20 +338,6 @@ static struct skcipher_alg des3_ede_skciphers[] = {
.setkey = des3_ede_x86_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "ctr(des3_ede)",
- .base.cra_driver_name = "ctr-des3_ede-asm",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = DES3_EDE_KEY_SIZE,
- .max_keysize = DES3_EDE_KEY_SIZE,
- .ivsize = DES3_EDE_BLOCK_SIZE,
- .chunksize = DES3_EDE_BLOCK_SIZE,
- .setkey = des3_ede_x86_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
}
};
diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h
new file mode 100644
index 000000000000..eaa15c7b29d6
--- /dev/null
+++ b/arch/x86/crypto/ecb_cbc_helpers.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _CRYPTO_ECB_CBC_HELPER_H
+#define _CRYPTO_ECB_CBC_HELPER_H
+
+#include <crypto/internal/skcipher.h>
+#include <asm/fpu/api.h>
+
+/*
+ * Mode helpers to instantiate parameterized skcipher ECB/CBC modes without
+ * having to rely on indirect calls and retpolines.
+ */
+
+#define ECB_WALK_START(req, bsize, fpu_blocks) do { \
+ void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); \
+ const int __bsize = (bsize); \
+ struct skcipher_walk walk; \
+ int err = skcipher_walk_virt(&walk, (req), false); \
+ while (walk.nbytes > 0) { \
+ unsigned int nbytes = walk.nbytes; \
+ bool do_fpu = (fpu_blocks) != -1 && \
+ nbytes >= (fpu_blocks) * __bsize; \
+ const u8 *src = walk.src.virt.addr; \
+ u8 *dst = walk.dst.virt.addr; \
+ u8 __maybe_unused buf[(bsize)]; \
+ if (do_fpu) kernel_fpu_begin()
+
+#define CBC_WALK_START(req, bsize, fpu_blocks) \
+ ECB_WALK_START(req, bsize, fpu_blocks)
+
+#define ECB_WALK_ADVANCE(blocks) do { \
+ dst += (blocks) * __bsize; \
+ src += (blocks) * __bsize; \
+ nbytes -= (blocks) * __bsize; \
+} while (0)
+
+#define ECB_BLOCK(blocks, func) do { \
+ while (nbytes >= (blocks) * __bsize) { \
+ (func)(ctx, dst, src); \
+ ECB_WALK_ADVANCE(blocks); \
+ } \
+} while (0)
+
+#define CBC_ENC_BLOCK(func) do { \
+ const u8 *__iv = walk.iv; \
+ while (nbytes >= __bsize) { \
+ crypto_xor_cpy(dst, src, __iv, __bsize); \
+ (func)(ctx, dst, dst); \
+ __iv = dst; \
+ ECB_WALK_ADVANCE(1); \
+ } \
+ memcpy(walk.iv, __iv, __bsize); \
+} while (0)
+
+#define CBC_DEC_BLOCK(blocks, func) do { \
+ while (nbytes >= (blocks) * __bsize) { \
+ const u8 *__iv = src + ((blocks) - 1) * __bsize; \
+ if (dst == src) \
+ __iv = memcpy(buf, __iv, __bsize); \
+ (func)(ctx, dst, src); \
+ crypto_xor(dst, walk.iv, __bsize); \
+ memcpy(walk.iv, __iv, __bsize); \
+ ECB_WALK_ADVANCE(blocks); \
+ } \
+} while (0)
+
+#define ECB_WALK_END() \
+ if (do_fpu) kernel_fpu_end(); \
+ err = skcipher_walk_done(&walk, nbytes); \
+ } \
+ return err; \
+} while (0)
+
+#define CBC_WALK_END() ECB_WALK_END()
+
+#endif
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
index d08fc575ef7f..3da385271227 100644
--- a/arch/x86/crypto/glue_helper-asm-avx.S
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -34,107 +34,3 @@
vpxor (5*16)(src), x6, x6; \
vpxor (6*16)(src), x7, x7; \
store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
-
-#define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
-
-#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
- vpcmpeqd t0, t0, t0; \
- vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
- vmovdqa bswap, t1; \
- \
- /* load IV and byteswap */ \
- vmovdqu (iv), x7; \
- vpshufb t1, x7, x0; \
- \
- /* construct IVs */ \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x1; \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x2; \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x3; \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x4; \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x5; \
- inc_le128(x7, t0, t2); \
- vpshufb t1, x7, x6; \
- inc_le128(x7, t0, t2); \
- vmovdqa x7, t2; \
- vpshufb t1, x7, x7; \
- inc_le128(t2, t0, t1); \
- vmovdqu t2, (iv);
-
-#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
- vpxor (0*16)(src), x0, x0; \
- vpxor (1*16)(src), x1, x1; \
- vpxor (2*16)(src), x2, x2; \
- vpxor (3*16)(src), x3, x3; \
- vpxor (4*16)(src), x4, x4; \
- vpxor (5*16)(src), x5, x5; \
- vpxor (6*16)(src), x6, x6; \
- vpxor (7*16)(src), x7, x7; \
- store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
-
-#define gf128mul_x_ble(iv, mask, tmp) \
- vpsrad $31, iv, tmp; \
- vpaddq iv, iv, iv; \
- vpshufd $0x13, tmp, tmp; \
- vpand mask, tmp, tmp; \
- vpxor tmp, iv, iv;
-
-#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
- t1, xts_gf128mul_and_shl1_mask) \
- vmovdqa xts_gf128mul_and_shl1_mask, t0; \
- \
- /* load IV */ \
- vmovdqu (iv), tiv; \
- vpxor (0*16)(src), tiv, x0; \
- vmovdqu tiv, (0*16)(dst); \
- \
- /* construct and store IVs, also xor with source */ \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (1*16)(src), tiv, x1; \
- vmovdqu tiv, (1*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (2*16)(src), tiv, x2; \
- vmovdqu tiv, (2*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (3*16)(src), tiv, x3; \
- vmovdqu tiv, (3*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (4*16)(src), tiv, x4; \
- vmovdqu tiv, (4*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (5*16)(src), tiv, x5; \
- vmovdqu tiv, (5*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (6*16)(src), tiv, x6; \
- vmovdqu tiv, (6*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vpxor (7*16)(src), tiv, x7; \
- vmovdqu tiv, (7*16)(dst); \
- \
- gf128mul_x_ble(tiv, t0, t1); \
- vmovdqu tiv, (iv);
-
-#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
- vpxor (0*16)(dst), x0, x0; \
- vpxor (1*16)(dst), x1, x1; \
- vpxor (2*16)(dst), x2, x2; \
- vpxor (3*16)(dst), x3, x3; \
- vpxor (4*16)(dst), x4, x4; \
- vpxor (5*16)(dst), x5, x5; \
- vpxor (6*16)(dst), x6, x6; \
- vpxor (7*16)(dst), x7, x7; \
- store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/glue_helper-asm-avx2.S b/arch/x86/crypto/glue_helper-asm-avx2.S
index d84508c85c13..c77e9049431f 100644
--- a/arch/x86/crypto/glue_helper-asm-avx2.S
+++ b/arch/x86/crypto/glue_helper-asm-avx2.S
@@ -37,139 +37,3 @@
vpxor (5*32+16)(src), x6, x6; \
vpxor (6*32+16)(src), x7, x7; \
store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
-
-#define inc_le128(x, minus_one, tmp) \
- vpcmpeqq minus_one, x, tmp; \
- vpsubq minus_one, x, x; \
- vpslldq $8, tmp, tmp; \
- vpsubq tmp, x, x;
-
-#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
- vpcmpeqq minus_one, x, tmp1; \
- vpcmpeqq minus_two, x, tmp2; \
- vpsubq minus_two, x, x; \
- vpor tmp2, tmp1, tmp1; \
- vpslldq $8, tmp1, tmp1; \
- vpsubq tmp1, x, x;
-
-#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
- t1x, t2, t2x, t3, t3x, t4, t5) \
- vpcmpeqd t0, t0, t0; \
- vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
- vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
- \
- /* load IV and byteswap */ \
- vmovdqu (iv), t2x; \
- vmovdqa t2x, t3x; \
- inc_le128(t2x, t0x, t1x); \
- vbroadcasti128 bswap, t1; \
- vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
- vpshufb t1, t2, x0; \
- \
- /* construct IVs */ \
- add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
- vpshufb t1, t2, x1; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x2; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x3; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x4; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x5; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x6; \
- add2_le128(t2, t0, t4, t3, t5); \
- vpshufb t1, t2, x7; \
- vextracti128 $1, t2, t2x; \
- inc_le128(t2x, t0x, t3x); \
- vmovdqu t2x, (iv);
-
-#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
- vpxor (0*32)(src), x0, x0; \
- vpxor (1*32)(src), x1, x1; \
- vpxor (2*32)(src), x2, x2; \
- vpxor (3*32)(src), x3, x3; \
- vpxor (4*32)(src), x4, x4; \
- vpxor (5*32)(src), x5, x5; \
- vpxor (6*32)(src), x6, x6; \
- vpxor (7*32)(src), x7, x7; \
- store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
-
-#define gf128mul_x_ble(iv, mask, tmp) \
- vpsrad $31, iv, tmp; \
- vpaddq iv, iv, iv; \
- vpshufd $0x13, tmp, tmp; \
- vpand mask, tmp, tmp; \
- vpxor tmp, iv, iv;
-
-#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
- vpsrad $31, iv, tmp0; \
- vpaddq iv, iv, tmp1; \
- vpsllq $2, iv, iv; \
- vpshufd $0x13, tmp0, tmp0; \
- vpsrad $31, tmp1, tmp1; \
- vpand mask2, tmp0, tmp0; \
- vpshufd $0x13, tmp1, tmp1; \
- vpxor tmp0, iv, iv; \
- vpand mask1, tmp1, tmp1; \
- vpxor tmp1, iv, iv;
-
-#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
- tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
- xts_gf128mul_and_shl1_mask_0, \
- xts_gf128mul_and_shl1_mask_1) \
- vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
- \
- /* load IV and construct second IV */ \
- vmovdqu (iv), tivx; \
- vmovdqa tivx, t0x; \
- gf128mul_x_ble(tivx, t1x, t2x); \
- vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
- vinserti128 $1, tivx, t0, tiv; \
- vpxor (0*32)(src), tiv, x0; \
- vmovdqu tiv, (0*32)(dst); \
- \
- /* construct and store IVs, also xor with source */ \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (1*32)(src), tiv, x1; \
- vmovdqu tiv, (1*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (2*32)(src), tiv, x2; \
- vmovdqu tiv, (2*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (3*32)(src), tiv, x3; \
- vmovdqu tiv, (3*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (4*32)(src), tiv, x4; \
- vmovdqu tiv, (4*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (5*32)(src), tiv, x5; \
- vmovdqu tiv, (5*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (6*32)(src), tiv, x6; \
- vmovdqu tiv, (6*32)(dst); \
- \
- gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
- vpxor (7*32)(src), tiv, x7; \
- vmovdqu tiv, (7*32)(dst); \
- \
- vextracti128 $1, tiv, tivx; \
- gf128mul_x_ble(tivx, t1x, t2x); \
- vmovdqu tivx, (iv);
-
-#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
- vpxor (0*32)(dst), x0, x0; \
- vpxor (1*32)(dst), x1, x1; \
- vpxor (2*32)(dst), x2, x2; \
- vpxor (3*32)(dst), x3, x3; \
- vpxor (4*32)(dst), x4, x4; \
- vpxor (5*32)(dst), x5, x5; \
- vpxor (6*32)(dst), x6, x6; \
- vpxor (7*32)(dst), x7, x7; \
- store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
deleted file mode 100644
index d3d91a0abf88..000000000000
--- a/arch/x86/crypto/glue_helper.c
+++ /dev/null
@@ -1,381 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Shared glue code for 128bit block ciphers
- *
- * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
- * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
- */
-
-#include <linux/module.h>
-#include <crypto/b128ops.h>
-#include <crypto/gf128mul.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/scatterwalk.h>
-#include <crypto/xts.h>
-#include <asm/crypto/glue_helper.h>
-
-int glue_ecb_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req)
-{
- void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
- const unsigned int bsize = 128 / 8;
- struct skcipher_walk walk;
- bool fpu_enabled = false;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- const u8 *src = walk.src.virt.addr;
- u8 *dst = walk.dst.virt.addr;
- unsigned int func_bytes;
- unsigned int i;
-
- fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled, nbytes);
- for (i = 0; i < gctx->num_funcs; i++) {
- func_bytes = bsize * gctx->funcs[i].num_blocks;
-
- if (nbytes < func_bytes)
- continue;
-
- /* Process multi-block batch */
- do {
- gctx->funcs[i].fn_u.ecb(ctx, dst, src);
- src += func_bytes;
- dst += func_bytes;
- nbytes -= func_bytes;
- } while (nbytes >= func_bytes);
-
- if (nbytes < bsize)
- break;
- }
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- glue_fpu_end(fpu_enabled);
- return err;
-}
-EXPORT_SYMBOL_GPL(glue_ecb_req_128bit);
-
-int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn,
- struct skcipher_request *req)
-{
- void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
- const unsigned int bsize = 128 / 8;
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- const u128 *src = (u128 *)walk.src.virt.addr;
- u128 *dst = (u128 *)walk.dst.virt.addr;
- u128 *iv = (u128 *)walk.iv;
-
- do {
- u128_xor(dst, src, iv);
- fn(ctx, (u8 *)dst, (u8 *)dst);
- iv = dst;
- src++;
- dst++;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
- *(u128 *)walk.iv = *iv;
- err = skcipher_walk_done(&walk, nbytes);
- }
- return err;
-}
-EXPORT_SYMBOL_GPL(glue_cbc_encrypt_req_128bit);
-
-int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req)
-{
- void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
- const unsigned int bsize = 128 / 8;
- struct skcipher_walk walk;
- bool fpu_enabled = false;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- const u128 *src = walk.src.virt.addr;
- u128 *dst = walk.dst.virt.addr;
- unsigned int func_bytes, num_blocks;
- unsigned int i;
- u128 last_iv;
-
- fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled, nbytes);
- /* Start of the last block. */
- src += nbytes / bsize - 1;
- dst += nbytes / bsize - 1;
-
- last_iv = *src;
-
- for (i = 0; i < gctx->num_funcs; i++) {
- num_blocks = gctx->funcs[i].num_blocks;
- func_bytes = bsize * num_blocks;
-
- if (nbytes < func_bytes)
- continue;
-
- /* Process multi-block batch */
- do {
- src -= num_blocks - 1;
- dst -= num_blocks - 1;
-
- gctx->funcs[i].fn_u.cbc(ctx, (u8 *)dst,
- (const u8 *)src);
-
- nbytes -= func_bytes;
- if (nbytes < bsize)
- goto done;
-
- u128_xor(dst, dst, --src);
- dst--;
- } while (nbytes >= func_bytes);
- }
-done:
- u128_xor(dst, dst, (u128 *)walk.iv);
- *(u128 *)walk.iv = last_iv;
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- glue_fpu_end(fpu_enabled);
- return err;
-}
-EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit);
-
-int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req)
-{
- void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req));
- const unsigned int bsize = 128 / 8;
- struct skcipher_walk walk;
- bool fpu_enabled = false;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes) >= bsize) {
- const u128 *src = walk.src.virt.addr;
- u128 *dst = walk.dst.virt.addr;
- unsigned int func_bytes, num_blocks;
- unsigned int i;
- le128 ctrblk;
-
- fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled, nbytes);
-
- be128_to_le128(&ctrblk, (be128 *)walk.iv);
-
- for (i = 0; i < gctx->num_funcs; i++) {
- num_blocks = gctx->funcs[i].num_blocks;
- func_bytes = bsize * num_blocks;
-
- if (nbytes < func_bytes)
- continue;
-
- /* Process multi-block batch */
- do {
- gctx->funcs[i].fn_u.ctr(ctx, (u8 *)dst,
- (const u8 *)src,
- &ctrblk);
- src += num_blocks;
- dst += num_blocks;
- nbytes -= func_bytes;
- } while (nbytes >= func_bytes);
-
- if (nbytes < bsize)
- break;
- }
-
- le128_to_be128((be128 *)walk.iv, &ctrblk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- glue_fpu_end(fpu_enabled);
-
- if (nbytes) {
- le128 ctrblk;
- u128 tmp;
-
- be128_to_le128(&ctrblk, (be128 *)walk.iv);
- memcpy(&tmp, walk.src.virt.addr, nbytes);
- gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, (u8 *)&tmp,
- (const u8 *)&tmp,
- &ctrblk);
- memcpy(walk.dst.virt.addr, &tmp, nbytes);
- le128_to_be128((be128 *)walk.iv, &ctrblk);
-
- err = skcipher_walk_done(&walk, 0);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(glue_ctr_req_128bit);
-
-static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx,
- void *ctx,
- struct skcipher_walk *walk)
-{
- const unsigned int bsize = 128 / 8;
- unsigned int nbytes = walk->nbytes;
- u128 *src = walk->src.virt.addr;
- u128 *dst = walk->dst.virt.addr;
- unsigned int num_blocks, func_bytes;
- unsigned int i;
-
- /* Process multi-block batch */
- for (i = 0; i < gctx->num_funcs; i++) {
- num_blocks = gctx->funcs[i].num_blocks;
- func_bytes = bsize * num_blocks;
-
- if (nbytes >= func_bytes) {
- do {
- gctx->funcs[i].fn_u.xts(ctx, (u8 *)dst,
- (const u8 *)src,
- walk->iv);
-
- src += num_blocks;
- dst += num_blocks;
- nbytes -= func_bytes;
- } while (nbytes >= func_bytes);
-
- if (nbytes < bsize)
- goto done;
- }
- }
-
-done:
- return nbytes;
-}
-
-int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req,
- common_glue_func_t tweak_fn, void *tweak_ctx,
- void *crypt_ctx, bool decrypt)
-{
- const bool cts = (req->cryptlen % XTS_BLOCK_SIZE);
- const unsigned int bsize = 128 / 8;
- struct skcipher_request subreq;
- struct skcipher_walk walk;
- bool fpu_enabled = false;
- unsigned int nbytes, tail;
- int err;
-
- if (req->cryptlen < XTS_BLOCK_SIZE)
- return -EINVAL;
-
- if (unlikely(cts)) {
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-
- tail = req->cryptlen % XTS_BLOCK_SIZE + XTS_BLOCK_SIZE;
-
- skcipher_request_set_tfm(&subreq, tfm);
- skcipher_request_set_callback(&subreq,
- crypto_skcipher_get_flags(tfm),
- NULL, NULL);
- skcipher_request_set_crypt(&subreq, req->src, req->dst,
- req->cryptlen - tail, req->iv);
- req = &subreq;
- }
-
- err = skcipher_walk_virt(&walk, req, false);
- nbytes = walk.nbytes;
- if (err)
- return err;
-
- /* set minimum length to bsize, for tweak_fn */
- fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
- &walk, fpu_enabled,
- nbytes < bsize ? bsize : nbytes);
-
- /* calculate first value of T */
- tweak_fn(tweak_ctx, walk.iv, walk.iv);
-
- while (nbytes) {
- nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk);
-
- err = skcipher_walk_done(&walk, nbytes);
- nbytes = walk.nbytes;
- }
-
- if (unlikely(cts)) {
- u8 *next_tweak, *final_tweak = req->iv;
- struct scatterlist *src, *dst;
- struct scatterlist s[2], d[2];
- le128 b[2];
-
- dst = src = scatterwalk_ffwd(s, req->src, req->cryptlen);
- if (req->dst != req->src)
- dst = scatterwalk_ffwd(d, req->dst, req->cryptlen);
-
- if (decrypt) {
- next_tweak = memcpy(b, req->iv, XTS_BLOCK_SIZE);
- gf128mul_x_ble(b, b);
- } else {
- next_tweak = req->iv;
- }
-
- skcipher_request_set_crypt(&subreq, src, dst, XTS_BLOCK_SIZE,
- next_tweak);
-
- err = skcipher_walk_virt(&walk, req, false) ?:
- skcipher_walk_done(&walk,
- __glue_xts_req_128bit(gctx, crypt_ctx, &walk));
- if (err)
- goto out;
-
- scatterwalk_map_and_copy(b, dst, 0, XTS_BLOCK_SIZE, 0);
- memcpy(b + 1, b, tail - XTS_BLOCK_SIZE);
- scatterwalk_map_and_copy(b, src, XTS_BLOCK_SIZE,
- tail - XTS_BLOCK_SIZE, 0);
- scatterwalk_map_and_copy(b, dst, 0, tail, 1);
-
- skcipher_request_set_crypt(&subreq, dst, dst, XTS_BLOCK_SIZE,
- final_tweak);
-
- err = skcipher_walk_virt(&walk, req, false) ?:
- skcipher_walk_done(&walk,
- __glue_xts_req_128bit(gctx, crypt_ctx, &walk));
- }
-
-out:
- glue_fpu_end(fpu_enabled);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(glue_xts_req_128bit);
-
-void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv, common_glue_func_t fn)
-{
- le128 ivblk = *iv;
-
- /* generate next IV */
- gf128mul_x_ble(iv, &ivblk);
-
- /* CC <- T xor C */
- u128_xor((u128 *)dst, (const u128 *)src, (u128 *)&ivblk);
-
- /* PP <- D(Key2,CC) */
- fn(ctx, dst, dst);
-
- /* P <- T xor PP */
- u128_xor((u128 *)dst, (u128 *)dst, (u128 *)&ivblk);
-}
-EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one);
-
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index ba9e4c1e7f5c..b7ee24df7fba 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -18,10 +18,6 @@
.align 16
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
-.align 16
-.Lxts_gf128mul_and_shl1_mask:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.text
@@ -715,67 +711,3 @@ SYM_FUNC_START(serpent_cbc_dec_8way_avx)
FRAME_END
ret;
SYM_FUNC_END(serpent_cbc_dec_8way_avx)
-
-SYM_FUNC_START(serpent_ctr_8way_avx)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
-
- load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RK0, RK1, RK2);
-
- call __serpent_enc_blk8_avx;
-
- store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_ctr_8way_avx)
-
-SYM_FUNC_START(serpent_xts_enc_8way_avx)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
- RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
-
- call __serpent_enc_blk8_avx;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_xts_enc_8way_avx)
-
-SYM_FUNC_START(serpent_xts_dec_8way_avx)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
- RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
-
- call __serpent_dec_blk8_avx;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_xts_dec_8way_avx)
diff --git a/arch/x86/crypto/serpent-avx.h b/arch/x86/crypto/serpent-avx.h
new file mode 100644
index 000000000000..23f3361a0e72
--- /dev/null
+++ b/arch/x86/crypto/serpent-avx.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_X86_SERPENT_AVX_H
+#define ASM_X86_SERPENT_AVX_H
+
+#include <crypto/b128ops.h>
+#include <crypto/serpent.h>
+#include <linux/types.h>
+
+struct crypto_skcipher;
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+
+asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst,
+ const u8 *src);
+
+#endif
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
index c9648aeae705..9161b6e441f3 100644
--- a/arch/x86/crypto/serpent-avx2-asm_64.S
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -20,16 +20,6 @@
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
-.align 16
-.Lxts_gf128mul_and_shl1_mask_0:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-
-.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
-.align 16
-.Lxts_gf128mul_and_shl1_mask_1:
- .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
-
.text
#define CTX %rdi
@@ -734,80 +724,3 @@ SYM_FUNC_START(serpent_cbc_dec_16way)
FRAME_END
ret;
SYM_FUNC_END(serpent_cbc_dec_16way)
-
-SYM_FUNC_START(serpent_ctr_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
- tp);
-
- call __serpent_enc_blk16;
-
- store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- vzeroupper;
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_ctr_16way)
-
-SYM_FUNC_START(serpent_xts_enc_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
- .Lxts_gf128mul_and_shl1_mask_0,
- .Lxts_gf128mul_and_shl1_mask_1);
-
- call __serpent_enc_blk16;
-
- store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- vzeroupper;
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_xts_enc_16way)
-
-SYM_FUNC_START(serpent_xts_dec_16way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- vzeroupper;
-
- load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
- .Lxts_gf128mul_and_shl1_mask_0,
- .Lxts_gf128mul_and_shl1_mask_1);
-
- call __serpent_dec_blk16;
-
- store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
-
- vzeroupper;
-
- FRAME_END
- ret;
-SYM_FUNC_END(serpent_xts_dec_16way)
diff --git a/arch/x86/include/asm/crypto/serpent-sse2.h b/arch/x86/crypto/serpent-sse2.h
index 860ca248914b..860ca248914b 100644
--- a/arch/x86/include/asm/crypto/serpent-sse2.h
+++ b/arch/x86/crypto/serpent-sse2.h
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index f973ace44ad3..ccf0b5fa4933 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -12,9 +12,9 @@
#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
#include <crypto/serpent.h>
-#include <crypto/xts.h>
-#include <asm/crypto/glue_helper.h>
-#include <asm/crypto/serpent-avx.h>
+
+#include "serpent-avx.h"
+#include "ecb_cbc_helpers.h"
#define SERPENT_AVX2_PARALLEL_BLOCKS 16
@@ -23,158 +23,44 @@ asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void serpent_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void serpent_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
{
return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
}
-static const struct common_glue_ctx serpent_enc = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .ecb = serpent_ecb_enc_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .ecb = serpent_ecb_enc_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_encrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_ctr = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .ctr = serpent_ctr_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .ctr = serpent_ctr_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = __serpent_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx serpent_enc_xts = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .xts = serpent_xts_enc_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .xts = serpent_xts_enc_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = serpent_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .ecb = serpent_ecb_dec_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .ecb = serpent_ecb_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_decrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec_cbc = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .cbc = serpent_cbc_dec_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .cbc = serpent_cbc_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = __serpent_decrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec_xts = {
- .num_funcs = 3,
- .fpu_blocks_limit = 8,
-
- .funcs = { {
- .num_blocks = 16,
- .fn_u = { .xts = serpent_xts_dec_16way }
- }, {
- .num_blocks = 8,
- .fn_u = { .xts = serpent_xts_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = serpent_xts_dec }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_enc, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_enc_16way);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx);
+ ECB_BLOCK(1, __serpent_encrypt);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_dec, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_dec_16way);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx);
+ ECB_BLOCK(1, __serpent_decrypt);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__serpent_encrypt);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&serpent_ctr, req);
-}
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&serpent_enc_xts, req,
- __serpent_encrypt, &ctx->tweak_ctx,
- &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&serpent_dec_xts, req,
- __serpent_encrypt, &ctx->tweak_ctx,
- &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_cbc_dec_16way);
+ CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx);
+ CBC_DEC_BLOCK(1, __serpent_decrypt);
+ CBC_WALK_END();
}
static struct skcipher_alg serpent_algs[] = {
@@ -205,35 +91,6 @@ static struct skcipher_alg serpent_algs[] = {
.setkey = serpent_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(serpent)",
- .base.cra_driver_name = "__ctr-serpent-avx2",
- .base.cra_priority = 600,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct serpent_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = SERPENT_MIN_KEY_SIZE,
- .max_keysize = SERPENT_MAX_KEY_SIZE,
- .ivsize = SERPENT_BLOCK_SIZE,
- .chunksize = SERPENT_BLOCK_SIZE,
- .setkey = serpent_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(serpent)",
- .base.cra_driver_name = "__xts-serpent-avx2",
- .base.cra_priority = 600,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = SERPENT_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct serpent_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * SERPENT_MIN_KEY_SIZE,
- .max_keysize = 2 * SERPENT_MAX_KEY_SIZE,
- .ivsize = SERPENT_BLOCK_SIZE,
- .setkey = xts_serpent_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
},
};
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 7806d1cbe854..6c248e1ea4ef 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -15,9 +15,9 @@
#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
#include <crypto/serpent.h>
-#include <crypto/xts.h>
-#include <asm/crypto/glue_helper.h>
-#include <asm/crypto/serpent-avx.h>
+
+#include "serpent-avx.h"
+#include "ecb_cbc_helpers.h"
/* 8-way parallel cipher functions */
asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst,
@@ -32,191 +32,41 @@ asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst,
const u8 *src);
EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
-asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx);
-
-asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx);
-
-asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx);
-
-void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblk;
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- le128_to_be128(&ctrblk, iv);
- le128_inc(iv);
-
- __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
- u128_xor(dst, src, (u128 *)&ctrblk);
-}
-EXPORT_SYMBOL_GPL(__serpent_crypt_ctr);
-
-void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_encrypt);
-}
-EXPORT_SYMBOL_GPL(serpent_xts_enc);
-
-void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_decrypt);
-}
-EXPORT_SYMBOL_GPL(serpent_xts_dec);
-
static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
{
return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
}
-int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- /* first half of xts-key is for crypt */
- err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
- if (err)
- return err;
-
- /* second half of xts-key is for tweak */
- return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
-}
-EXPORT_SYMBOL_GPL(xts_serpent_setkey);
-
-static const struct common_glue_ctx serpent_enc = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = serpent_ecb_enc_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_encrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_ctr = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ctr = serpent_ctr_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = __serpent_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx serpent_enc_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .xts = serpent_xts_enc_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = serpent_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = serpent_ecb_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_decrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec_cbc = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .cbc = serpent_cbc_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = __serpent_decrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .xts = serpent_xts_dec_8way_avx }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = serpent_xts_dec }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_enc, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx);
+ ECB_BLOCK(1, __serpent_encrypt);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_dec, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx);
+ ECB_BLOCK(1, __serpent_decrypt);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__serpent_encrypt);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&serpent_ctr, req);
-}
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&serpent_enc_xts, req,
- __serpent_encrypt, &ctx->tweak_ctx,
- &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&serpent_dec_xts, req,
- __serpent_encrypt, &ctx->tweak_ctx,
- &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx);
+ CBC_DEC_BLOCK(1, __serpent_decrypt);
+ CBC_WALK_END();
}
static struct skcipher_alg serpent_algs[] = {
@@ -247,35 +97,6 @@ static struct skcipher_alg serpent_algs[] = {
.setkey = serpent_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(serpent)",
- .base.cra_driver_name = "__ctr-serpent-avx",
- .base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct serpent_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = SERPENT_MIN_KEY_SIZE,
- .max_keysize = SERPENT_MAX_KEY_SIZE,
- .ivsize = SERPENT_BLOCK_SIZE,
- .chunksize = SERPENT_BLOCK_SIZE,
- .setkey = serpent_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(serpent)",
- .base.cra_driver_name = "__xts-serpent-avx",
- .base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = SERPENT_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct serpent_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * SERPENT_MIN_KEY_SIZE,
- .max_keysize = 2 * SERPENT_MAX_KEY_SIZE,
- .ivsize = SERPENT_BLOCK_SIZE,
- .setkey = xts_serpent_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
},
};
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 4fed8d26b91a..d78f37e9b2cf 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -10,8 +10,6 @@
*
* CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
* Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
- * CTR part based on code (crypto/ctr.c) by:
- * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
*/
#include <linux/module.h>
@@ -22,8 +20,9 @@
#include <crypto/b128ops.h>
#include <crypto/internal/simd.h>
#include <crypto/serpent.h>
-#include <asm/crypto/serpent-sse2.h>
-#include <asm/crypto/glue_helper.h>
+
+#include "serpent-sse2.h"
+#include "ecb_cbc_helpers.h"
static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -31,130 +30,46 @@ static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
}
-static void serpent_decrypt_cbc_xway(const void *ctx, u8 *d, const u8 *s)
-{
- u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
- unsigned int j;
-
- for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
- ivs[j] = src[j];
-
- serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
- for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
- u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
-}
-
-static void serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblk;
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- le128_to_be128(&ctrblk, iv);
- le128_inc(iv);
-
- __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
- u128_xor(dst, src, (u128 *)&ctrblk);
-}
-
-static void serpent_crypt_ctr_xway(const void *ctx, u8 *d, const u8 *s,
- le128 *iv)
+static void serpent_decrypt_cbc_xway(const void *ctx, u8 *dst, const u8 *src)
{
- be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
- unsigned int i;
-
- for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
- if (dst != src)
- dst[i] = src[i];
-
- le128_to_be128(&ctrblks[i], iv);
- le128_inc(iv);
- }
+ u8 buf[SERPENT_PARALLEL_BLOCKS - 1][SERPENT_BLOCK_SIZE];
+ const u8 *s = src;
- serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
+ if (dst == src)
+ s = memcpy(buf, src, sizeof(buf));
+ serpent_dec_blk_xway(ctx, dst, src);
+ crypto_xor(dst + SERPENT_BLOCK_SIZE, s, sizeof(buf));
}
-static const struct common_glue_ctx serpent_enc = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = serpent_enc_blk_xway }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_encrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_ctr = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ctr = serpent_crypt_ctr_xway }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = serpent_crypt_ctr }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = serpent_dec_blk_xway }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = __serpent_decrypt }
- } }
-};
-
-static const struct common_glue_ctx serpent_dec_cbc = {
- .num_funcs = 2,
- .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .cbc = serpent_decrypt_cbc_xway }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = __serpent_decrypt }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_enc, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_enc_blk_xway);
+ ECB_BLOCK(1, __serpent_encrypt);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&serpent_dec, req);
+ ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_dec_blk_xway);
+ ECB_BLOCK(1, __serpent_decrypt);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(__serpent_encrypt,
- req);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(__serpent_encrypt);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&serpent_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&serpent_ctr, req);
+ CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_decrypt_cbc_xway);
+ CBC_DEC_BLOCK(1, __serpent_decrypt);
+ CBC_WALK_END();
}
static struct skcipher_alg serpent_algs[] = {
@@ -185,21 +100,6 @@ static struct skcipher_alg serpent_algs[] = {
.setkey = serpent_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(serpent)",
- .base.cra_driver_name = "__ctr-serpent-sse2",
- .base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct serpent_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = SERPENT_MIN_KEY_SIZE,
- .max_keysize = SERPENT_MAX_KEY_SIZE,
- .ivsize = SERPENT_BLOCK_SIZE,
- .chunksize = SERPENT_BLOCK_SIZE,
- .setkey = serpent_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
},
};
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index a5151393bb2f..37e63b3c664e 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -19,11 +19,6 @@
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.section .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
-.align 16
-.Lxts_gf128mul_and_shl1_mask:
- .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
-
.text
/* structure of crypto context */
@@ -379,78 +374,3 @@ SYM_FUNC_START(twofish_cbc_dec_8way)
FRAME_END
ret;
SYM_FUNC_END(twofish_cbc_dec_8way)
-
-SYM_FUNC_START(twofish_ctr_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (little endian, 128bit)
- */
- FRAME_BEGIN
-
- pushq %r12;
-
- movq %rsi, %r11;
- movq %rdx, %r12;
-
- load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
- RD2, RX0, RX1, RY0);
-
- call __twofish_enc_blk8;
-
- store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
-
- popq %r12;
-
- FRAME_END
- ret;
-SYM_FUNC_END(twofish_ctr_8way)
-
-SYM_FUNC_START(twofish_xts_enc_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- movq %rsi, %r11;
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
- RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
-
- call __twofish_enc_blk8;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
-
- FRAME_END
- ret;
-SYM_FUNC_END(twofish_xts_enc_8way)
-
-SYM_FUNC_START(twofish_xts_dec_8way)
- /* input:
- * %rdi: ctx, CTX
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
- */
- FRAME_BEGIN
-
- movq %rsi, %r11;
-
- /* regs <= src, dst <= IVs, regs <= regs xor IVs */
- load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
- RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
-
- call __twofish_dec_blk8;
-
- /* dst <= regs xor IVs(in dst) */
- store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
-
- FRAME_END
- ret;
-SYM_FUNC_END(twofish_xts_dec_8way)
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/crypto/twofish.h
index 2c377a8042e1..12df400e6d53 100644
--- a/arch/x86/include/asm/crypto/twofish.h
+++ b/arch/x86/crypto/twofish.h
@@ -17,9 +17,5 @@ asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src);
/* helpers from twofish_x86_64-3way module */
extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src);
-extern void twofish_enc_blk_ctr(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-extern void twofish_enc_blk_ctr_3way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
#endif /* ASM_X86_TWOFISH_H */
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 2dbc8ce3730e..3eb3440b477a 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -15,9 +15,9 @@
#include <crypto/algapi.h>
#include <crypto/internal/simd.h>
#include <crypto/twofish.h>
-#include <crypto/xts.h>
-#include <asm/crypto/glue_helper.h>
-#include <asm/crypto/twofish.h>
+
+#include "twofish.h"
+#include "ecb_cbc_helpers.h"
#define TWOFISH_PARALLEL_BLOCKS 8
@@ -26,13 +26,6 @@ asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void twofish_ctr_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-asmlinkage void twofish_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-asmlinkage void twofish_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -45,171 +38,38 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src)
__twofish_enc_blk_3way(ctx, dst, src, false);
}
-static void twofish_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_enc_blk);
-}
-
-static void twofish_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_dec_blk);
-}
-
-struct twofish_xts_ctx {
- struct twofish_ctx tweak_ctx;
- struct twofish_ctx crypt_ctx;
-};
-
-static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- /* first half of xts-key is for crypt */
- err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2);
- if (err)
- return err;
-
- /* second half of xts-key is for tweak */
- return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
-}
-
-static const struct common_glue_ctx twofish_enc = {
- .num_funcs = 3,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ecb = twofish_ecb_enc_8way }
- }, {
- .num_blocks = 3,
- .fn_u = { .ecb = twofish_enc_blk_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = twofish_enc_blk }
- } }
-};
-
-static const struct common_glue_ctx twofish_ctr = {
- .num_funcs = 3,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ctr = twofish_ctr_8way }
- }, {
- .num_blocks = 3,
- .fn_u = { .ctr = twofish_enc_blk_ctr_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = twofish_enc_blk_ctr }
- } }
-};
-
-static const struct common_glue_ctx twofish_enc_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .xts = twofish_xts_enc_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = twofish_xts_enc }
- } }
-};
-
-static const struct common_glue_ctx twofish_dec = {
- .num_funcs = 3,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ecb = twofish_ecb_dec_8way }
- }, {
- .num_blocks = 3,
- .fn_u = { .ecb = twofish_dec_blk_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = twofish_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx twofish_dec_cbc = {
- .num_funcs = 3,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .cbc = twofish_cbc_dec_8way }
- }, {
- .num_blocks = 3,
- .fn_u = { .cbc = twofish_dec_blk_cbc_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = twofish_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx twofish_dec_xts = {
- .num_funcs = 2,
- .fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
-
- .funcs = { {
- .num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .xts = twofish_xts_dec_8way }
- }, {
- .num_blocks = 1,
- .fn_u = { .xts = twofish_xts_dec }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&twofish_enc, req);
+ ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS);
+ ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_enc_8way);
+ ECB_BLOCK(3, twofish_enc_blk_3way);
+ ECB_BLOCK(1, twofish_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&twofish_dec, req);
+ ECB_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS);
+ ECB_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_ecb_dec_8way);
+ ECB_BLOCK(3, twofish_dec_blk_3way);
+ ECB_BLOCK(1, twofish_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req);
+ CBC_WALK_START(req, TF_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(twofish_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&twofish_ctr, req);
-}
-
-static int xts_encrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&twofish_enc_xts, req, twofish_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return glue_xts_req_128bit(&twofish_dec_xts, req, twofish_enc_blk,
- &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+ CBC_WALK_START(req, TF_BLOCK_SIZE, TWOFISH_PARALLEL_BLOCKS);
+ CBC_DEC_BLOCK(TWOFISH_PARALLEL_BLOCKS, twofish_cbc_dec_8way);
+ CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way);
+ CBC_DEC_BLOCK(1, twofish_dec_blk);
+ CBC_WALK_END();
}
static struct skcipher_alg twofish_algs[] = {
@@ -240,35 +100,6 @@ static struct skcipher_alg twofish_algs[] = {
.setkey = twofish_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "__ctr(twofish)",
- .base.cra_driver_name = "__ctr-twofish-avx",
- .base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct twofish_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = TF_MIN_KEY_SIZE,
- .max_keysize = TF_MAX_KEY_SIZE,
- .ivsize = TF_BLOCK_SIZE,
- .chunksize = TF_BLOCK_SIZE,
- .setkey = twofish_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
- }, {
- .base.cra_name = "__xts(twofish)",
- .base.cra_driver_name = "__xts-twofish-avx",
- .base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
- .base.cra_blocksize = TF_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct twofish_xts_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * TF_MIN_KEY_SIZE,
- .max_keysize = 2 * TF_MAX_KEY_SIZE,
- .ivsize = TF_BLOCK_SIZE,
- .setkey = xts_twofish_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
},
};
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 768af6075479..03725696397c 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -5,17 +5,16 @@
* Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
*/
-#include <asm/crypto/glue_helper.h>
-#include <asm/crypto/twofish.h>
#include <crypto/algapi.h>
-#include <crypto/b128ops.h>
-#include <crypto/internal/skcipher.h>
#include <crypto/twofish.h>
#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
+#include "twofish.h"
+#include "ecb_cbc_helpers.h"
+
EXPORT_SYMBOL_GPL(__twofish_enc_blk_3way);
EXPORT_SYMBOL_GPL(twofish_dec_blk_3way);
@@ -30,143 +29,48 @@ static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src)
__twofish_enc_blk_3way(ctx, dst, src, false);
}
-static inline void twofish_enc_blk_xor_3way(const void *ctx, u8 *dst,
- const u8 *src)
-{
- __twofish_enc_blk_3way(ctx, dst, src, true);
-}
-
-void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s)
+void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src)
{
- u128 ivs[2];
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- ivs[0] = src[0];
- ivs[1] = src[1];
+ u8 buf[2][TF_BLOCK_SIZE];
+ const u8 *s = src;
- twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+ if (dst == src)
+ s = memcpy(buf, src, sizeof(buf));
+ twofish_dec_blk_3way(ctx, dst, src);
+ crypto_xor(dst + TF_BLOCK_SIZE, s, sizeof(buf));
- u128_xor(&dst[1], &dst[1], &ivs[0]);
- u128_xor(&dst[2], &dst[2], &ivs[1]);
}
EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
-void twofish_enc_blk_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblk;
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- if (dst != src)
- *dst = *src;
-
- le128_to_be128(&ctrblk, iv);
- le128_inc(iv);
-
- twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
- u128_xor(dst, dst, (u128 *)&ctrblk);
-}
-EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
-
-void twofish_enc_blk_ctr_3way(const void *ctx, u8 *d, const u8 *s, le128 *iv)
-{
- be128 ctrblks[3];
- u128 *dst = (u128 *)d;
- const u128 *src = (const u128 *)s;
-
- if (dst != src) {
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- }
-
- le128_to_be128(&ctrblks[0], iv);
- le128_inc(iv);
- le128_to_be128(&ctrblks[1], iv);
- le128_inc(iv);
- le128_to_be128(&ctrblks[2], iv);
- le128_inc(iv);
-
- twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
-EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr_3way);
-
-static const struct common_glue_ctx twofish_enc = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 3,
- .fn_u = { .ecb = twofish_enc_blk_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = twofish_enc_blk }
- } }
-};
-
-static const struct common_glue_ctx twofish_ctr = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 3,
- .fn_u = { .ctr = twofish_enc_blk_ctr_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ctr = twofish_enc_blk_ctr }
- } }
-};
-
-static const struct common_glue_ctx twofish_dec = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 3,
- .fn_u = { .ecb = twofish_dec_blk_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .ecb = twofish_dec_blk }
- } }
-};
-
-static const struct common_glue_ctx twofish_dec_cbc = {
- .num_funcs = 2,
- .fpu_blocks_limit = -1,
-
- .funcs = { {
- .num_blocks = 3,
- .fn_u = { .cbc = twofish_dec_blk_cbc_3way }
- }, {
- .num_blocks = 1,
- .fn_u = { .cbc = twofish_dec_blk }
- } }
-};
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&twofish_enc, req);
+ ECB_WALK_START(req, TF_BLOCK_SIZE, -1);
+ ECB_BLOCK(3, twofish_enc_blk_3way);
+ ECB_BLOCK(1, twofish_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return glue_ecb_req_128bit(&twofish_dec, req);
+ ECB_WALK_START(req, TF_BLOCK_SIZE, -1);
+ ECB_BLOCK(3, twofish_dec_blk_3way);
+ ECB_BLOCK(1, twofish_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req);
+ CBC_WALK_START(req, TF_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(twofish_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- return glue_cbc_decrypt_req_128bit(&twofish_dec_cbc, req);
-}
-
-static int ctr_crypt(struct skcipher_request *req)
-{
- return glue_ctr_req_128bit(&twofish_ctr, req);
+ CBC_WALK_START(req, TF_BLOCK_SIZE, -1);
+ CBC_DEC_BLOCK(3, twofish_dec_blk_cbc_3way);
+ CBC_DEC_BLOCK(1, twofish_dec_blk);
+ CBC_WALK_END();
}
static struct skcipher_alg tf_skciphers[] = {
@@ -195,20 +99,6 @@ static struct skcipher_alg tf_skciphers[] = {
.setkey = twofish_setkey_skcipher,
.encrypt = cbc_encrypt,
.decrypt = cbc_decrypt,
- }, {
- .base.cra_name = "ctr(twofish)",
- .base.cra_driver_name = "ctr-twofish-3way",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct twofish_ctx),
- .base.cra_module = THIS_MODULE,
- .min_keysize = TF_MIN_KEY_SIZE,
- .max_keysize = TF_MAX_KEY_SIZE,
- .ivsize = TF_BLOCK_SIZE,
- .chunksize = TF_BLOCK_SIZE,
- .setkey = twofish_setkey_skcipher,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
},
};
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 0904f5676e4d..4efd39aacb9f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -128,7 +128,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
regs->ax = -EFAULT;
instrumentation_end();
- syscall_exit_to_user_mode(regs);
+ local_irq_disable();
+ irqentry_exit_to_user_mode(regs);
return false;
}
@@ -249,30 +250,23 @@ static __always_inline bool get_and_clear_inhcall(void) { return false; }
static __always_inline void restore_inhcall(bool inhcall) { }
#endif
-static void __xen_pv_evtchn_do_upcall(void)
+static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
- irq_enter_rcu();
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
inc_irq_stat(irq_hv_callback_count);
xen_hvm_evtchn_do_upcall();
- irq_exit_rcu();
+ set_irq_regs(old_regs);
}
__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
{
- struct pt_regs *old_regs;
+ irqentry_state_t state = irqentry_enter(regs);
bool inhcall;
- irqentry_state_t state;
- state = irqentry_enter(regs);
- old_regs = set_irq_regs(regs);
-
- instrumentation_begin();
- run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
- instrumentation_begin();
-
- set_irq_regs(old_regs);
+ run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
inhcall = get_and_clear_inhcall();
if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cad08703c4ad..400908dff42e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -46,14 +46,6 @@
.code64
.section .entry.text, "ax"
-#ifdef CONFIG_PARAVIRT_XXL
-SYM_CODE_START(native_usergs_sysret64)
- UNWIND_HINT_EMPTY
- swapgs
- sysretq
-SYM_CODE_END(native_usergs_sysret64)
-#endif /* CONFIG_PARAVIRT_XXL */
-
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
@@ -123,7 +115,12 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
+ * In the Xen PV case we must use iret anyway.
*/
+
+ ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
+ X86_FEATURE_XENPV
+
movq RCX(%rsp), %rcx
movq RIP(%rsp), %r11
@@ -215,7 +212,8 @@ syscall_return_via_sysret:
popq %rdi
popq %rsp
- USERGS_SYSRET64
+ swapgs
+ sysretq
SYM_CODE_END(entry_SYSCALL_64)
/*
@@ -669,7 +667,7 @@ native_irq_return_ldt:
*/
pushq %rdi /* Stash user RDI */
- SWAPGS /* to kernel GS */
+ swapgs /* to kernel GS */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
movq PER_CPU_VAR(espfix_waddr), %rdi
@@ -699,7 +697,7 @@ native_irq_return_ldt:
orq PER_CPU_VAR(espfix_stack), %rax
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
- SWAPGS /* to user GS */
+ swapgs /* to user GS */
popq %rdi /* Restore user RDI */
movq %rax, %rsp
@@ -756,47 +754,6 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
SYM_CODE_END(.Lbad_gs)
.previous
-/*
- * rdi: New stack pointer points to the top word of the stack
- * rsi: Function pointer
- * rdx: Function argument (can be NULL if none)
- */
-SYM_FUNC_START(asm_call_on_stack)
-SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL)
-SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL)
- /*
- * Save the frame pointer unconditionally. This allows the ORC
- * unwinder to handle the stack switch.
- */
- pushq %rbp
- mov %rsp, %rbp
-
- /*
- * The unwinder relies on the word at the top of the new stack
- * page linking back to the previous RSP.
- */
- mov %rsp, (%rdi)
- mov %rdi, %rsp
- /* Move the argument to the right place */
- mov %rdx, %rdi
-
-1:
- .pushsection .discard.instr_begin
- .long 1b - .
- .popsection
-
- CALL_NOSPEC rsi
-
-2:
- .pushsection .discard.instr_end
- .long 2b - .
- .popsection
-
- /* Restore the previous stack pointer from RBP. */
- leaveq
- ret
-SYM_FUNC_END(asm_call_on_stack)
-
#ifdef CONFIG_XEN_PV
/*
* A note on the "critical region" in our callback handler.
@@ -943,7 +900,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
ret
.Lparanoid_entry_swapgs:
- SWAPGS
+ swapgs
/*
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
@@ -1001,7 +958,7 @@ SYM_CODE_START_LOCAL(paranoid_exit)
jnz restore_regs_and_return_to_kernel
/* We are returning to a context with user GSBASE */
- SWAPGS_UNSAFE_STACK
+ swapgs
jmp restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
@@ -1426,7 +1383,7 @@ nmi_no_fsgsbase:
jnz nmi_restore
nmi_swapgs:
- SWAPGS_UNSAFE_STACK
+ swapgs
nmi_restore:
POP_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 541fdaf64045..0051cf5c792d 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -210,6 +210,8 @@ SYM_CODE_START(entry_SYSCALL_compat)
/* Switch to the kernel stack */
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
+
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
pushq %r8 /* pt_regs->sp */
diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile
index 6fb9b57ed5ba..d8c4f6c9eadc 100644
--- a/arch/x86/entry/syscalls/Makefile
+++ b/arch/x86/entry/syscalls/Makefile
@@ -6,8 +6,8 @@ uapi := arch/$(SRCARCH)/include/generated/uapi/asm
_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \
$(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)')
-syscall32 := $(srctree)/$(src)/syscall_32.tbl
-syscall64 := $(srctree)/$(src)/syscall_64.tbl
+syscall32 := $(src)/syscall_32.tbl
+syscall64 := $(src)/syscall_64.tbl
syshdr := $(srctree)/$(src)/syscallhdr.sh
systbl := $(srctree)/$(src)/syscalltbl.sh
@@ -21,37 +21,37 @@ quiet_cmd_systbl = SYSTBL $@
cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
quiet_cmd_hypercalls = HYPERCALLS $@
- cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^)
+ cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs))
syshdr_abi_unistd_32 := i386
-$(uapi)/unistd_32.h: $(syscall32) $(syshdr)
+$(uapi)/unistd_32.h: $(syscall32) $(syshdr) FORCE
$(call if_changed,syshdr)
syshdr_abi_unistd_32_ia32 := i386
syshdr_pfx_unistd_32_ia32 := ia32_
-$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr)
+$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) FORCE
$(call if_changed,syshdr)
syshdr_abi_unistd_x32 := common,x32
syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT
-$(uapi)/unistd_x32.h: $(syscall64) $(syshdr)
+$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) FORCE
$(call if_changed,syshdr)
syshdr_abi_unistd_64 := common,64
-$(uapi)/unistd_64.h: $(syscall64) $(syshdr)
+$(uapi)/unistd_64.h: $(syscall64) $(syshdr) FORCE
$(call if_changed,syshdr)
syshdr_abi_unistd_64_x32 := x32
syshdr_pfx_unistd_64_x32 := x32_
-$(out)/unistd_64_x32.h: $(syscall64) $(syshdr)
+$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE
$(call if_changed,syshdr)
-$(out)/syscalls_32.h: $(syscall32) $(systbl)
+$(out)/syscalls_32.h: $(syscall32) $(systbl) FORCE
$(call if_changed,systbl)
-$(out)/syscalls_64.h: $(syscall64) $(systbl)
+$(out)/syscalls_64.h: $(syscall64) $(systbl) FORCE
$(call if_changed,systbl)
-$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh
+$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh FORCE
$(call if_changed,hypercalls)
$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h
@@ -62,9 +62,10 @@ syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h
syshdr-$(CONFIG_X86_64) += syscalls_64.h
syshdr-$(CONFIG_XEN) += xen-hypercalls.h
-targets += $(uapisyshdr-y) $(syshdr-y)
+uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y))
+syshdr-y := $(addprefix $(out)/, $(syshdr-y))
+targets += $(addprefix ../../../../, $(uapisyshdr-y) $(syshdr-y))
PHONY += all
-all: $(addprefix $(uapi)/,$(uapisyshdr-y))
-all: $(addprefix $(out)/,$(syshdr-y))
+all: $(uapisyshdr-y) $(syshdr-y)
@:
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 874aeacde2dd..a1c9f496fca6 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -446,3 +446,4 @@
439 i386 faccessat2 sys_faccessat2
440 i386 process_madvise sys_process_madvise
441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
+442 i386 mount_setattr sys_mount_setattr
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 78672124d28b..7bf01cbe582f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -363,6 +363,7 @@
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
+442 common mount_setattr sys_mount_setattr
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 02e3e42f380b..05c4abc2fdfd 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -91,7 +91,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),)
endif
endif
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
#
# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
@@ -150,6 +150,7 @@ KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += -fno-stack-protector
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index e37de298a495..18df17129695 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -81,6 +81,12 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
+/*
+ * This one is magic, it will get called even when PMU init fails (because
+ * there is no PMU), in which case it should simply return NULL.
+ */
+DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
+
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -253,6 +259,8 @@ static bool check_hw_exists(void)
if (ret)
goto msr_fail;
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+ if (fixed_counter_disabled(i))
+ continue;
if (val & (0x03 << i*4)) {
bios_fail = 1;
val_fail = val;
@@ -665,6 +673,12 @@ void x86_pmu_disable_all(void)
}
}
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+ return static_call(x86_pmu_guest_get_msrs)(nr);
+}
+EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
+
/*
* There may be PMI landing after enabled=0. The PMI hitting could be before or
* after disable_all.
@@ -1523,6 +1537,8 @@ void perf_event_print_debug(void)
cpu, idx, prev_left);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx))
+ continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1923,6 +1939,8 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
+
+ static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
}
static void _x86_pmu_read(struct perf_event *event)
@@ -1995,12 +2013,17 @@ static int __init init_hw_perf_events(void)
pr_info("... generic registers: %d\n", x86_pmu.num_counters);
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... fixed-purpose events: %lu\n",
+ hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1)
+ << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl));
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
+ if (!x86_pmu.guest_get_msrs)
+ x86_pmu.guest_get_msrs = (void *)&__static_call_return0;
+
x86_pmu_static_call_update();
/*
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index d4569bfa83e3..37ce38403cb8 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -275,6 +275,55 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_spr_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* INST_RETIRED.PREC_DIST */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6),
+ METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7),
+
+ INTEL_EVENT_CONSTRAINT(0x2e, 0xff),
+ INTEL_EVENT_CONSTRAINT(0x3c, 0xff),
+ /*
+ * Generally event codes < 0x90 are restricted to counters 0-3.
+ * The 0x2E and 0x3C are exception, which has no restriction.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf),
+
+ INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf),
+ INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1),
+ INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1),
+ INTEL_EVENT_CONSTRAINT(0xce, 0x1),
+ INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf),
+ /*
+ * Generally event codes >= 0x90 are likely to have no restrictions.
+ * The exception are defined as above.
+ */
+ INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff),
+
+ EVENT_CONSTRAINT_END
+};
+
+
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
@@ -314,11 +363,15 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles,
EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale,
"4", "2");
-EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4");
-EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80");
-EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81");
-EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82");
-EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83");
+EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4");
+EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80");
+EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81");
+EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82");
+EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83");
+EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84");
+EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85");
+EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86");
+EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87");
static struct attribute *snb_events_attrs[] = {
EVENT_PTR(td_slots_issued),
@@ -384,6 +437,108 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
+static __initconst const u64 spr_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0,
+ [ C(RESULT_MISS) ] = 0xe124,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_MISS) ] = 0xe424,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0,
+ [ C(RESULT_MISS) ] = 0xe12,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0,
+ [ C(RESULT_MISS) ] = 0xe13,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = 0xe11,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4c4,
+ [ C(RESULT_MISS) ] = 0x4c5,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x12a,
+ [ C(RESULT_MISS) ] = 0x12a,
+ },
+ },
+};
+
+static __initconst const u64 spr_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10001,
+ [ C(RESULT_MISS) ] = 0x3fbfc00001,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x3f3ffc0002,
+ [ C(RESULT_MISS) ] = 0x3f3fc00002,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10c000001,
+ [ C(RESULT_MISS) ] = 0x3fb3000001,
+ },
+ },
+};
+
/*
* Notes on the events:
* - data reads do not include code reads (comparable to earlier tables)
@@ -2134,18 +2289,6 @@ static void intel_tfa_pmu_enable_all(int added)
intel_pmu_enable_all(added);
}
-static void enable_counter_freeze(void)
-{
- update_debugctlmsr(get_debugctlmsr() |
- DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
-}
-
-static void disable_counter_freeze(void)
-{
- update_debugctlmsr(get_debugctlmsr() &
- ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI);
-}
-
static inline u64 intel_pmu_get_status(void)
{
u64 status;
@@ -2337,8 +2480,8 @@ static void __icl_update_topdown_event(struct perf_event *event,
}
}
-static void update_saved_topdown_regs(struct perf_event *event,
- u64 slots, u64 metrics)
+static void update_saved_topdown_regs(struct perf_event *event, u64 slots,
+ u64 metrics, int metric_end)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2347,7 +2490,7 @@ static void update_saved_topdown_regs(struct perf_event *event,
event->hw.saved_slots = slots;
event->hw.saved_metric = metrics;
- for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) {
+ for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
continue;
other = cpuc->events[idx];
@@ -2362,7 +2505,8 @@ static void update_saved_topdown_regs(struct perf_event *event,
* The PERF_METRICS and Fixed counter 3 are read separately. The values may be
* modify by a NMI. PMU has to be disabled before calling this function.
*/
-static u64 icl_update_topdown_event(struct perf_event *event)
+
+static u64 intel_update_topdown_event(struct perf_event *event, int metric_end)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *other;
@@ -2378,7 +2522,7 @@ static u64 icl_update_topdown_event(struct perf_event *event)
/* read PERF_METRICS */
rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics);
- for_each_set_bit(idx, cpuc->active_mask, INTEL_PMC_IDX_TD_BE_BOUND + 1) {
+ for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) {
if (!is_topdown_idx(idx))
continue;
other = cpuc->events[idx];
@@ -2404,7 +2548,7 @@ static u64 icl_update_topdown_event(struct perf_event *event)
* Don't need to reset the PERF_METRICS and Fixed counter 3.
* Because the values will be restored in next schedule in.
*/
- update_saved_topdown_regs(event, slots, metrics);
+ update_saved_topdown_regs(event, slots, metrics, metric_end);
reset = false;
}
@@ -2413,12 +2557,18 @@ static u64 icl_update_topdown_event(struct perf_event *event)
wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0);
wrmsrl(MSR_PERF_METRICS, 0);
if (event)
- update_saved_topdown_regs(event, 0, 0);
+ update_saved_topdown_regs(event, 0, 0, metric_end);
}
return slots;
}
+static u64 icl_update_topdown_event(struct perf_event *event)
+{
+ return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE +
+ x86_pmu.num_topdown_events - 1);
+}
+
static void intel_pmu_read_topdown_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2573,8 +2723,11 @@ static void intel_pmu_reset(void)
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
}
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx))
+ continue;
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ }
if (ds)
ds->bts_index = ds->bts_buffer_base;
@@ -2709,95 +2862,6 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
return handled;
}
-static bool disable_counter_freezing = true;
-static int __init intel_perf_counter_freezing_setup(char *s)
-{
- bool res;
-
- if (kstrtobool(s, &res))
- return -EINVAL;
-
- disable_counter_freezing = !res;
- return 1;
-}
-__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup);
-
-/*
- * Simplified handler for Arch Perfmon v4:
- * - We rely on counter freezing/unfreezing to enable/disable the PMU.
- * This is done automatically on PMU ack.
- * - Ack the PMU only after the APIC.
- */
-
-static int intel_pmu_handle_irq_v4(struct pt_regs *regs)
-{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- int handled = 0;
- bool bts = false;
- u64 status;
- int pmu_enabled = cpuc->enabled;
- int loops = 0;
-
- /* PMU has been disabled because of counter freezing */
- cpuc->enabled = 0;
- if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
- bts = true;
- intel_bts_disable_local();
- handled = intel_pmu_drain_bts_buffer();
- handled += intel_bts_interrupt();
- }
- status = intel_pmu_get_status();
- if (!status)
- goto done;
-again:
- intel_pmu_lbr_read();
- if (++loops > 100) {
- static bool warned;
-
- if (!warned) {
- WARN(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
- warned = true;
- }
- intel_pmu_reset();
- goto done;
- }
-
-
- handled += handle_pmi_common(regs, status);
-done:
- /* Ack the PMI in the APIC */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- /*
- * The counters start counting immediately while ack the status.
- * Make it as close as possible to IRET. This avoids bogus
- * freezing on Skylake CPUs.
- */
- if (status) {
- intel_pmu_ack_status(status);
- } else {
- /*
- * CPU may issues two PMIs very close to each other.
- * When the PMI handler services the first one, the
- * GLOBAL_STATUS is already updated to reflect both.
- * When it IRETs, the second PMI is immediately
- * handled and it sees clear status. At the meantime,
- * there may be a third PMI, because the freezing bit
- * isn't set since the ack in first PMI handlers.
- * Double check if there is more work to be done.
- */
- status = intel_pmu_get_status();
- if (status)
- goto again;
- }
-
- if (bts)
- intel_bts_enable_local();
- cpuc->enabled = pmu_enabled;
- return handled;
-}
-
/*
* This handler is triggered by the local APIC, so the APIC IRQ handling
* rules apply:
@@ -3563,6 +3627,26 @@ static int core_pmu_hw_config(struct perf_event *event)
return intel_pmu_bts_config(event);
}
+#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \
+ ((x86_pmu.num_topdown_events - 1) << 8))
+
+static bool is_available_metric_event(struct perf_event *event)
+{
+ return is_metric_event(event) &&
+ event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX;
+}
+
+static inline bool is_mem_loads_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01);
+}
+
+static inline bool is_mem_loads_aux_event(struct perf_event *event)
+{
+ return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82);
+}
+
+
static int intel_pmu_hw_config(struct perf_event *event)
{
int ret = x86_pmu_hw_config(event);
@@ -3575,11 +3659,16 @@ static int intel_pmu_hw_config(struct perf_event *event)
return ret;
if (event->attr.precise_ip) {
+ if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT)
+ return -EINVAL;
+
if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
if (!(event->attr.sample_type &
- ~intel_pmu_large_pebs_flags(event)))
+ ~intel_pmu_large_pebs_flags(event))) {
event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+ }
}
if (x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
@@ -3592,6 +3681,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
/*
* BTS is set up earlier in this path, so don't account twice
@@ -3636,7 +3726,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.config & X86_ALL_EVENT_FLAGS)
return -EINVAL;
- if (is_metric_event(event)) {
+ if (is_available_metric_event(event)) {
struct perf_event *leader = event->group_leader;
/* The metric events don't support sampling. */
@@ -3665,6 +3755,33 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
}
+ /*
+ * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR
+ * doesn't function quite right. As a work-around it needs to always be
+ * co-scheduled with a auxiliary event X86_CONFIG(.event=0x03, .umask=0x82).
+ * The actual count of this second event is irrelevant it just needs
+ * to be active to make the first event function correctly.
+ *
+ * In a group, the auxiliary event must be in front of the load latency
+ * event. The rule is to simplify the implementation of the check.
+ * That's because perf cannot have a complete group at the moment.
+ */
+ if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX &&
+ (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
+ is_mem_loads_event(event)) {
+ struct perf_event *leader = event->group_leader;
+ struct perf_event *sibling = NULL;
+
+ if (!is_mem_loads_aux_event(leader)) {
+ for_each_sibling_event(sibling, leader) {
+ if (is_mem_loads_aux_event(sibling))
+ break;
+ }
+ if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list))
+ return -ENODATA;
+ }
+ }
+
if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
return 0;
@@ -3680,26 +3797,6 @@ static int intel_pmu_hw_config(struct perf_event *event)
return 0;
}
-#ifdef CONFIG_RETPOLINE
-static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr);
-static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr);
-#endif
-
-struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
-#ifdef CONFIG_RETPOLINE
- if (x86_pmu.guest_get_msrs == intel_guest_get_msrs)
- return intel_guest_get_msrs(nr);
- else if (x86_pmu.guest_get_msrs == core_guest_get_msrs)
- return core_guest_get_msrs(nr);
-#endif
- if (x86_pmu.guest_get_msrs)
- return x86_pmu.guest_get_msrs(nr);
- *nr = 0;
- return NULL;
-}
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
-
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -3865,6 +3962,29 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
}
static struct event_constraint *
+spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = icl_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * The :ppp indicates the Precise Distribution (PDist) facility, which
+ * is only supported on the GP counter 0. If a :ppp event which is not
+ * available on the GP counter 0, error out.
+ */
+ if (event->attr.precise_ip == 3) {
+ if (c->idxmsk64 & BIT_ULL(0))
+ return &counter0_constraint;
+
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
{
@@ -3953,6 +4073,14 @@ static u64 nhm_limit_period(struct perf_event *event, u64 left)
return max(left, 32ULL);
}
+static u64 spr_limit_period(struct perf_event *event, u64 left)
+{
+ if (event->attr.precise_ip == 3)
+ return max(left, 128ULL);
+
+ return left;
+}
+
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -4094,9 +4222,6 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.version > 1)
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
- if (x86_pmu.counter_freezing)
- enable_counter_freeze();
-
/* Disable perf metrics if any added CPU doesn't support it. */
if (x86_pmu.intel_cap.perf_metrics) {
union perf_capabilities perf_cap;
@@ -4167,9 +4292,6 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc)
static void intel_pmu_cpu_dying(int cpu)
{
fini_debug_store_on_cpu(cpu);
-
- if (x86_pmu.counter_freezing)
- disable_counter_freeze();
}
void intel_cpuc_finish(struct cpu_hw_events *cpuc)
@@ -4397,6 +4519,9 @@ static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e),
@@ -4561,39 +4686,6 @@ static __init void intel_nehalem_quirk(void)
}
}
-static const struct x86_cpu_desc counter_freezing_ucodes[] = {
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028),
- INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006),
- {}
-};
-
-static bool intel_counter_freezing_broken(void)
-{
- return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes);
-}
-
-static __init void intel_counter_freezing_quirk(void)
-{
- /* Check if it's already disabled */
- if (disable_counter_freezing)
- return;
-
- /*
- * If the system starts with the wrong ucode, leave the
- * counter-freezing feature permanently disabled.
- */
- if (intel_counter_freezing_broken()) {
- pr_info("PMU counter freezing disabled due to CPU errata,"
- "please upgrade microcode\n");
- x86_pmu.counter_freezing = false;
- x86_pmu.handle_irq = intel_pmu_handle_irq;
- }
-}
-
/*
* enable software workaround for errata:
* SNB: BJ122
@@ -4703,6 +4795,42 @@ static struct attribute *icl_tsx_events_attrs[] = {
NULL,
};
+
+EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2");
+EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82");
+
+static struct attribute *spr_events_attrs[] = {
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_spr),
+ EVENT_PTR(mem_ld_aux),
+ NULL,
+};
+
+static struct attribute *spr_td_events_attrs[] = {
+ EVENT_PTR(slots),
+ EVENT_PTR(td_retiring),
+ EVENT_PTR(td_bad_spec),
+ EVENT_PTR(td_fe_bound),
+ EVENT_PTR(td_be_bound),
+ EVENT_PTR(td_heavy_ops),
+ EVENT_PTR(td_br_mispredict),
+ EVENT_PTR(td_fetch_lat),
+ EVENT_PTR(td_mem_bound),
+ NULL,
+};
+
+static struct attribute *spr_tsx_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_capacity_read),
+ EVENT_PTR(tx_capacity_write),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
+ NULL,
+};
+
static ssize_t freeze_on_smi_show(struct device *cdev,
struct device_attribute *attr,
char *buf)
@@ -4926,7 +5054,7 @@ __init int intel_pmu_init(void)
union cpuid10_eax eax;
union cpuid10_ebx ebx;
struct event_constraint *c;
- unsigned int unused;
+ unsigned int fixed_mask;
struct extra_reg *er;
bool pmem = false;
int version, i;
@@ -4948,7 +5076,7 @@ __init int intel_pmu_init(void)
* Check whether the Architectural PerfMon supports
* Branch Misses Retired hw_event or not.
*/
- cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+ cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full);
if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
return -ENODEV;
@@ -4972,15 +5100,15 @@ __init int intel_pmu_init(void)
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events, when not running in a hypervisor:
*/
- if (version > 1) {
+ if (version > 1 && version < 5) {
int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
x86_pmu.num_counters_fixed =
max((int)edx.split.num_counters_fixed, assume);
- }
- if (version >= 4)
- x86_pmu.counter_freezing = !disable_counter_freezing;
+ fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1;
+ } else if (version >= 5)
+ x86_pmu.num_counters_fixed = fls(fixed_mask);
if (boot_cpu_has(X86_FEATURE_PDCM)) {
u64 capabilities;
@@ -5109,7 +5237,6 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_ATOM_GOLDMONT_D:
- x86_add_quirk(intel_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
@@ -5136,7 +5263,6 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
- x86_add_quirk(intel_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
@@ -5483,12 +5609,50 @@ __init int intel_pmu_init(void)
x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
x86_pmu.lbr_pt_coexist = true;
intel_pmu_pebs_data_source_skl(pmem);
+ x86_pmu.num_topdown_events = 4;
x86_pmu.update_topdown_event = icl_update_topdown_event;
x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
pr_cont("Icelake events, ");
name = "icelake";
break;
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ pmem = true;
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ x86_pmu.event_constraints = intel_spr_event_constraints;
+ x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_spr_extra_regs;
+ x86_pmu.limit_period = spr_limit_period;
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = spr_get_event_constraints;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ hsw_format_attr : nhm_format_attr;
+ extra_skl_attr = skl_format_attr;
+ mem_attr = spr_events_attrs;
+ td_attr = spr_td_events_attrs;
+ tsx_attr = spr_tsx_events_attrs;
+ x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
+ x86_pmu.lbr_pt_coexist = true;
+ intel_pmu_pebs_data_source_skl(pmem);
+ x86_pmu.num_topdown_events = 8;
+ x86_pmu.update_topdown_event = icl_update_topdown_event;
+ x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
+ pr_cont("Sapphire Rapids events, ");
+ name = "sapphire_rapids";
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -5531,8 +5695,7 @@ __init int intel_pmu_init(void)
x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
}
- x86_pmu.intel_ctrl |=
- ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+ x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED;
/* AnyThread may be deprecated on arch perfmon v5 or later */
if (x86_pmu.intel_cap.anythread_deprecated)
@@ -5549,13 +5712,22 @@ __init int intel_pmu_init(void)
* events to the generic counters.
*/
if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
+ /*
+ * Disable topdown slots and metrics events,
+ * if slots event is not in CPUID.
+ */
+ if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl))
+ c->idxmsk64 = 0;
c->weight = hweight64(c->idxmsk64);
continue;
}
- if (c->cmask == FIXED_EVENT_FLAGS
- && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+ if (c->cmask == FIXED_EVENT_FLAGS) {
+ /* Disabled fixed counters which are not in CPUID */
+ c->idxmsk64 &= x86_pmu.intel_ctrl;
+
+ if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
}
c->idxmsk64 &=
~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
@@ -5601,13 +5773,6 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, ");
}
- /*
- * For arch perfmon 4 use counter freezing to avoid
- * several MSR accesses in the PMI.
- */
- if (x86_pmu.counter_freezing)
- x86_pmu.handle_irq = intel_pmu_handle_irq_v4;
-
if (x86_pmu.intel_cap.perf_metrics)
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 67dbc91bccfe..d32b302719fe 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -36,7 +36,9 @@ union intel_x86_pebs_dse {
unsigned int ld_dse:4;
unsigned int ld_stlb_miss:1;
unsigned int ld_locked:1;
- unsigned int ld_reserved:26;
+ unsigned int ld_data_blk:1;
+ unsigned int ld_addr_blk:1;
+ unsigned int ld_reserved:24;
};
struct {
unsigned int st_l1d_hit:1;
@@ -45,6 +47,12 @@ union intel_x86_pebs_dse {
unsigned int st_locked:1;
unsigned int st_reserved2:26;
};
+ struct {
+ unsigned int st_lat_dse:4;
+ unsigned int st_lat_stlb_miss:1;
+ unsigned int st_lat_locked:1;
+ unsigned int ld_reserved3:26;
+ };
};
@@ -198,6 +206,63 @@ static u64 load_latency_data(u64 status)
if (dse.ld_locked)
val |= P(LOCK, LOCKED);
+ /*
+ * Ice Lake and earlier models do not support block infos.
+ */
+ if (!x86_pmu.pebs_block) {
+ val |= P(BLK, NA);
+ return val;
+ }
+ /*
+ * bit 6: load was blocked since its data could not be forwarded
+ * from a preceding store
+ */
+ if (dse.ld_data_blk)
+ val |= P(BLK, DATA);
+
+ /*
+ * bit 7: load was blocked due to potential address conflict with
+ * a preceding store
+ */
+ if (dse.ld_addr_blk)
+ val |= P(BLK, ADDR);
+
+ if (!dse.ld_data_blk && !dse.ld_addr_blk)
+ val |= P(BLK, NA);
+
+ return val;
+}
+
+static u64 store_latency_data(u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val;
+
+ dse.val = status;
+
+ /*
+ * use the mapping table for bit 0-3
+ */
+ val = pebs_data_source[dse.st_lat_dse];
+
+ /*
+ * bit 4: TLB access
+ * 0 = did not miss 2nd level TLB
+ * 1 = missed 2nd level TLB
+ */
+ if (dse.st_lat_stlb_miss)
+ val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ /*
+ * bit 5: locked prefix
+ */
+ if (dse.st_lat_locked)
+ val |= P(LOCK, LOCKED);
+
+ val |= P(BLK, NA);
+
return val;
}
@@ -870,6 +935,28 @@ struct event_constraint intel_icl_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_spr_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL),
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xfe),
+ INTEL_PSD_CONSTRAINT(0x2cd, 0x1),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf),
+
+ INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf),
+
+ /*
+ * Everything else is handled by PMU_FL_PEBS_ALL, because we
+ * need the full constraints from the main table.
+ */
+
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *c;
@@ -960,7 +1047,8 @@ static void adaptive_pebs_record_size_update(void)
}
#define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \
- PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \
+ PERF_SAMPLE_PHYS_ADDR | \
+ PERF_SAMPLE_WEIGHT_TYPE | \
PERF_SAMPLE_TRANSACTION | \
PERF_SAMPLE_DATA_PAGE_SIZE)
@@ -987,7 +1075,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
gprs = (sample_type & PERF_SAMPLE_REGS_INTR) &&
(attr->sample_regs_intr & PEBS_GP_REGS);
- tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) &&
+ tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
((attr->config & INTEL_ARCH_EVENT_MASK) ==
x86_pmu.rtm_abort_event);
@@ -1331,6 +1419,8 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
if (fl & PERF_X86_EVENT_PEBS_LDLAT)
val = load_latency_data(aux);
+ else if (fl & PERF_X86_EVENT_PEBS_STLAT)
+ val = store_latency_data(aux);
else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
val = precise_datala_hsw(event, aux);
else if (fst)
@@ -1369,8 +1459,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
/*
* Use latency for weight (only avail with PEBS-LL)
*/
- if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
- data->weight = pebs->lat;
+ if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE))
+ data->weight.full = pebs->lat;
/*
* data.data_src encodes the data source
@@ -1462,8 +1552,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
- if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
- data->weight = intel_get_tsx_weight(pebs->tsx_tuning);
+ if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll)
+ data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning);
if (sample_type & PERF_SAMPLE_TRANSACTION)
data->txn = intel_get_tsx_transaction(pebs->tsx_tuning,
@@ -1507,6 +1597,9 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
#endif
}
+#define PEBS_LATENCY_MASK 0xffff
+#define PEBS_CACHE_LATENCY_OFFSET 32
+
/*
* With adaptive PEBS the layout depends on what fields are configured.
*/
@@ -1577,9 +1670,27 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
}
if (format_size & PEBS_DATACFG_MEMINFO) {
- if (sample_type & PERF_SAMPLE_WEIGHT)
- data->weight = meminfo->latency ?:
- intel_get_tsx_weight(meminfo->tsx_tuning);
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
+ u64 weight = meminfo->latency;
+
+ if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) {
+ data->weight.var2_w = weight & PEBS_LATENCY_MASK;
+ weight >>= PEBS_CACHE_LATENCY_OFFSET;
+ }
+
+ /*
+ * Although meminfo::latency is defined as a u64,
+ * only the lower 32 bits include the valid data
+ * in practice on Ice Lake and earlier platforms.
+ */
+ if (sample_type & PERF_SAMPLE_WEIGHT) {
+ data->weight.full = weight ?:
+ intel_get_tsx_weight(meminfo->tsx_tuning);
+ } else {
+ data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
+ intel_get_tsx_weight(meminfo->tsx_tuning);
+ }
+ }
if (sample_type & PERF_SAMPLE_DATA_SRC)
data->data_src.val = get_data_src(event, meminfo->aux);
@@ -1899,7 +2010,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
*/
if (!pebs_status && cpuc->pebs_enabled &&
!(cpuc->pebs_enabled & (cpuc->pebs_enabled-1)))
- pebs_status = cpuc->pebs_enabled;
+ pebs_status = p->status = cpuc->pebs_enabled;
bit = find_first_bit((unsigned long *)&pebs_status,
x86_pmu.max_pebs_events);
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 357258f82dc8..33c8180d5a87 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -31,21 +31,21 @@ struct event_constraint uncore_constraint_empty =
MODULE_LICENSE("GPL");
-int uncore_pcibus_to_physid(struct pci_bus *bus)
+int uncore_pcibus_to_dieid(struct pci_bus *bus)
{
struct pci2phy_map *map;
- int phys_id = -1;
+ int die_id = -1;
raw_spin_lock(&pci2phy_map_lock);
list_for_each_entry(map, &pci2phy_map_head, list) {
if (map->segment == pci_domain_nr(bus)) {
- phys_id = map->pbus_to_physid[bus->number];
+ die_id = map->pbus_to_dieid[bus->number];
break;
}
}
raw_spin_unlock(&pci2phy_map_lock);
- return phys_id;
+ return die_id;
}
static void uncore_free_pcibus_map(void)
@@ -86,7 +86,7 @@ lookup:
alloc = NULL;
map->segment = segment;
for (i = 0; i < 256; i++)
- map->pbus_to_physid[i] = -1;
+ map->pbus_to_dieid[i] = -1;
list_add_tail(&map->list, &pci2phy_map_head);
end:
@@ -332,7 +332,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
uncore_pmu_init_hrtimer(box);
box->cpu = -1;
- box->pci_phys_id = -1;
box->dieid = -1;
/* set default hrtimer timeout */
@@ -993,18 +992,11 @@ uncore_types_init(struct intel_uncore_type **types, bool setid)
/*
* Get the die information of a PCI device.
* @pdev: The PCI device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static int uncore_pci_get_dev_die_info(struct pci_dev *pdev,
- int *phys_id, int *die)
+static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
{
- *phys_id = uncore_pcibus_to_physid(pdev->bus);
- if (*phys_id < 0)
- return -ENODEV;
-
- *die = (topology_max_die_per_package() > 1) ? *phys_id :
- topology_phys_to_logical_pkg(*phys_id);
+ *die = uncore_pcibus_to_dieid(pdev->bus);
if (*die < 0)
return -EINVAL;
@@ -1046,13 +1038,12 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
* @pdev: The PCI device.
* @type: The corresponding PMU type of the device.
* @pmu: The corresponding PMU of the device.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
static int uncore_pci_pmu_register(struct pci_dev *pdev,
struct intel_uncore_type *type,
struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+ int die)
{
struct intel_uncore_box *box;
int ret;
@@ -1070,7 +1061,6 @@ static int uncore_pci_pmu_register(struct pci_dev *pdev,
WARN_ON_ONCE(pmu->func_id != pdev->devfn);
atomic_inc(&box->refcnt);
- box->pci_phys_id = phys_id;
box->dieid = die;
box->pci_dev = pdev;
box->pmu = pmu;
@@ -1097,9 +1087,9 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
{
struct intel_uncore_type *type;
struct intel_uncore_pmu *pmu = NULL;
- int phys_id, die, ret;
+ int die, ret;
- ret = uncore_pci_get_dev_die_info(pdev, &phys_id, &die);
+ ret = uncore_pci_get_dev_die_info(pdev, &die);
if (ret)
return ret;
@@ -1132,7 +1122,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
}
- ret = uncore_pci_pmu_register(pdev, type, pmu, phys_id, die);
+ ret = uncore_pci_pmu_register(pdev, type, pmu, die);
pci_set_drvdata(pdev, pmu->boxes[die]);
@@ -1142,17 +1132,12 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
/*
* Unregister the PMU of a PCI device
* @pmu: The corresponding PMU is unregistered.
- * @phys_id: The physical socket id which the device maps to.
* @die: The die id which the device maps to.
*/
-static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu,
- int phys_id, int die)
+static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die)
{
struct intel_uncore_box *box = pmu->boxes[die];
- if (WARN_ON_ONCE(phys_id != box->pci_phys_id))
- return;
-
pmu->boxes[die] = NULL;
if (atomic_dec_return(&pmu->activeboxes) == 0)
uncore_pmu_unregister(pmu);
@@ -1164,9 +1149,9 @@ static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box;
struct intel_uncore_pmu *pmu;
- int i, phys_id, die;
+ int i, die;
- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return;
box = pci_get_drvdata(pdev);
@@ -1185,7 +1170,7 @@ static void uncore_pci_remove(struct pci_dev *pdev)
pci_set_drvdata(pdev, NULL);
- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);
}
static int uncore_bus_notify(struct notifier_block *nb,
@@ -1194,7 +1179,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
struct device *dev = data;
struct pci_dev *pdev = to_pci_dev(dev);
struct intel_uncore_pmu *pmu;
- int phys_id, die;
+ int die;
/* Unregister the PMU when the device is going to be deleted. */
if (action != BUS_NOTIFY_DEL_DEVICE)
@@ -1204,10 +1189,10 @@ static int uncore_bus_notify(struct notifier_block *nb,
if (!pmu)
return NOTIFY_DONE;
- if (uncore_pci_get_dev_die_info(pdev, &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pdev, &die))
return NOTIFY_DONE;
- uncore_pci_pmu_unregister(pmu, phys_id, die);
+ uncore_pci_pmu_unregister(pmu, die);
return NOTIFY_OK;
}
@@ -1224,7 +1209,7 @@ static void uncore_pci_sub_driver_init(void)
struct pci_dev *pci_sub_dev;
bool notify = false;
unsigned int devfn;
- int phys_id, die;
+ int die;
while (ids && ids->vendor) {
pci_sub_dev = NULL;
@@ -1244,12 +1229,11 @@ static void uncore_pci_sub_driver_init(void)
if (!pmu)
continue;
- if (uncore_pci_get_dev_die_info(pci_sub_dev,
- &phys_id, &die))
+ if (uncore_pci_get_dev_die_info(pci_sub_dev, &die))
continue;
if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu,
- phys_id, die))
+ die))
notify = true;
}
ids++;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 9efea154349d..a3c6e1643ad2 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -124,7 +124,6 @@ struct intel_uncore_extra_reg {
};
struct intel_uncore_box {
- int pci_phys_id;
int dieid; /* Logical die ID */
int n_active; /* number of active events */
int n_events;
@@ -173,11 +172,11 @@ struct freerunning_counters {
struct pci2phy_map {
struct list_head list;
int segment;
- int pbus_to_physid[256];
+ int pbus_to_dieid[256];
};
struct pci2phy_map *__find_pci2phy_map(int segment);
-int uncore_pcibus_to_physid(struct pci_bus *bus);
+int uncore_pcibus_to_dieid(struct pci_bus *bus);
ssize_t uncore_event_show(struct device *dev,
struct device_attribute *attr, char *buf);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 098f893e2e22..51271288499e 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -657,7 +657,7 @@ int snb_pci2phy_map_init(int devid)
pci_dev_put(dev);
return -ENOMEM;
}
- map->pbus_to_physid[bus] = 0;
+ map->pbus_to_dieid[bus] = 0;
raw_spin_unlock(&pci2phy_map_lock);
pci_dev_put(dev);
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 7bdb1821215d..b79951d0707c 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1359,7 +1359,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
{
struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid, segment;
+ int i, bus, nodeid, segment, die_id;
struct pci2phy_map *map;
int err = 0;
u32 config = 0;
@@ -1370,36 +1370,77 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
if (!ubox_dev)
break;
bus = ubox_dev->bus->number;
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
- if (err)
- break;
- nodeid = config & NODE_ID_MASK;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
- if (err)
- break;
+ /*
+ * The nodeid and idmap registers only contain enough
+ * information to handle 8 nodes. On systems with more
+ * than 8 nodes, we need to rely on NUMA information,
+ * filled in from BIOS supplied information, to determine
+ * the topology.
+ */
+ if (nr_node_ids <= 8) {
+ /* get the Node ID of the local register */
+ err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
+ if (err)
+ break;
+ nodeid = config & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+ if (err)
+ break;
- segment = pci_domain_nr(ubox_dev->bus);
- raw_spin_lock(&pci2phy_map_lock);
- map = __find_pci2phy_map(segment);
- if (!map) {
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
+
+ /*
+ * every three bits in the Node ID mapping register maps
+ * to a particular node.
+ */
+ for (i = 0; i < 8; i++) {
+ if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ if (topology_max_die_per_package() > 1)
+ die_id = i;
+ else
+ die_id = topology_phys_to_logical_pkg(i);
+ map->pbus_to_dieid[bus] = die_id;
+ break;
+ }
+ }
raw_spin_unlock(&pci2phy_map_lock);
- err = -ENOMEM;
- break;
- }
+ } else {
+ int node = pcibus_to_node(ubox_dev->bus);
+ int cpu;
+
+ segment = pci_domain_nr(ubox_dev->bus);
+ raw_spin_lock(&pci2phy_map_lock);
+ map = __find_pci2phy_map(segment);
+ if (!map) {
+ raw_spin_unlock(&pci2phy_map_lock);
+ err = -ENOMEM;
+ break;
+ }
- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
- map->pbus_to_physid[bus] = i;
+ die_id = -1;
+ for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node) {
+ map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
+ break;
+ }
+ }
+ raw_spin_unlock(&pci2phy_map_lock);
+
+ if (WARN_ON_ONCE(die_id == -1)) {
+ err = -EINVAL;
break;
}
}
- raw_spin_unlock(&pci2phy_map_lock);
}
if (!err) {
@@ -1412,17 +1453,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
i = -1;
if (reverse) {
for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
} else {
for (bus = 0; bus <= 255; bus++) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
+ if (map->pbus_to_dieid[bus] >= 0)
+ i = map->pbus_to_dieid[bus];
else
- map->pbus_to_physid[bus] = i;
+ map->pbus_to_dieid[bus] = i;
}
}
}
@@ -4646,19 +4687,14 @@ int snr_uncore_pci_init(void)
static struct pci_dev *snr_uncore_get_mc_dev(int id)
{
struct pci_dev *mc_dev = NULL;
- int phys_id, pkg;
+ int pkg;
while (1) {
mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev);
if (!mc_dev)
break;
- phys_id = uncore_pcibus_to_physid(mc_dev->bus);
- if (phys_id < 0)
- continue;
- pkg = topology_phys_to_logical_pkg(phys_id);
- if (pkg < 0)
- continue;
- else if (pkg == id)
+ pkg = uncore_pcibus_to_dieid(mc_dev->bus);
+ if (pkg == id)
break;
}
return mc_dev;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 7895cf4c59a7..53b2b5fc23bc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -80,6 +80,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */
#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */
#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */
+#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */
static inline bool is_topdown_count(struct perf_event *event)
{
@@ -443,6 +444,10 @@ struct cpu_hw_events {
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
+#define INTEL_PSD_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_STLAT)
+
#define INTEL_PST_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
@@ -682,8 +687,7 @@ struct x86_pmu {
/* PMI handler bits */
unsigned int late_ack :1,
- enabled_ack :1,
- counter_freezing :1;
+ enabled_ack :1;
/*
* sysfs attrs
*/
@@ -724,7 +728,8 @@ struct x86_pmu {
pebs_broken :1,
pebs_prec_dist :1,
pebs_no_tlb :1,
- pebs_no_isolation :1;
+ pebs_no_isolation :1,
+ pebs_block :1;
int pebs_record_size;
int pebs_buffer_size;
int max_pebs_events;
@@ -776,6 +781,7 @@ struct x86_pmu {
/*
* Intel perf metrics
*/
+ int num_topdown_events;
u64 (*update_topdown_event)(struct perf_event *event);
int (*set_topdown_event_period)(struct perf_event *event);
@@ -871,6 +877,8 @@ do { \
#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
#define PMU_FL_TFA 0x20 /* deal with TSX force abort */
#define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */
+#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */
+#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -1060,6 +1068,11 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+static inline bool fixed_counter_disabled(int i)
+{
+ return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
+}
+
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
@@ -1157,6 +1170,8 @@ extern struct event_constraint intel_skl_pebs_event_constraints[];
extern struct event_constraint intel_icl_pebs_event_constraints[];
+extern struct event_constraint intel_spr_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_add(struct perf_event *event);
diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c
index 136a1e847254..600bf8d15c0c 100644
--- a/arch/x86/events/probe.c
+++ b/arch/x86/events/probe.c
@@ -28,6 +28,7 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data)
for (bit = 0; bit < cnt; bit++) {
if (!msr[bit].no_check) {
struct attribute_group *grp = msr[bit].grp;
+ u64 mask;
/* skip entry with no group */
if (!grp)
@@ -44,8 +45,12 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data)
/* Virt sucks; you cannot tell if a R/O MSR is present :/ */
if (rdmsrl_safe(msr[bit].msr, &val))
continue;
+
+ mask = msr[bit].mask;
+ if (!mask)
+ mask = ~0ULL;
/* Disable zero counters if requested. */
- if (!zero && !val)
+ if (!zero && !(val & mask))
continue;
grp->is_visible = NULL;
diff --git a/arch/x86/events/probe.h b/arch/x86/events/probe.h
index 4c8e0afc5fb5..261b9bda24e3 100644
--- a/arch/x86/events/probe.h
+++ b/arch/x86/events/probe.h
@@ -4,10 +4,11 @@
#include <linux/sysfs.h>
struct perf_msr {
- u64 msr;
- struct attribute_group *grp;
+ u64 msr;
+ struct attribute_group *grp;
bool (*test)(int idx, void *data);
- bool no_check;
+ bool no_check;
+ u64 mask;
};
unsigned long
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 7dbbeaacd995..f42a70496a24 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -454,16 +454,9 @@ static struct attribute *rapl_events_cores[] = {
NULL,
};
-static umode_t
-rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i)
-{
- return 0;
-}
-
static struct attribute_group rapl_events_cores_group = {
.name = "events",
.attrs = rapl_events_cores,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_pkg[] = {
@@ -476,7 +469,6 @@ static struct attribute *rapl_events_pkg[] = {
static struct attribute_group rapl_events_pkg_group = {
.name = "events",
.attrs = rapl_events_pkg,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_ram[] = {
@@ -489,7 +481,6 @@ static struct attribute *rapl_events_ram[] = {
static struct attribute_group rapl_events_ram_group = {
.name = "events",
.attrs = rapl_events_ram,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_gpu[] = {
@@ -502,7 +493,6 @@ static struct attribute *rapl_events_gpu[] = {
static struct attribute_group rapl_events_gpu_group = {
.name = "events",
.attrs = rapl_events_gpu,
- .is_visible = rapl_not_visible,
};
static struct attribute *rapl_events_psys[] = {
@@ -515,7 +505,6 @@ static struct attribute *rapl_events_psys[] = {
static struct attribute_group rapl_events_psys_group = {
.name = "events",
.attrs = rapl_events_psys,
- .is_visible = rapl_not_visible,
};
static bool test_msr(int idx, void *data)
@@ -523,12 +512,23 @@ static bool test_msr(int idx, void *data)
return test_bit(idx, (unsigned long *) data);
}
+/* Only lower 32bits of the MSR represents the energy counter */
+#define RAPL_MSR_MASK 0xFFFFFFFF
+
static struct perf_msr intel_rapl_msrs[] = {
- [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr },
- [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr },
- [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr },
- [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr },
- [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr },
+ [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK },
+};
+
+static struct perf_msr intel_rapl_spr_msrs[] = {
+ [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK },
};
/*
@@ -761,7 +761,7 @@ static struct rapl_model model_spr = {
BIT(PERF_RAPL_PSYS),
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
.msr_power_unit = MSR_RAPL_POWER_UNIT,
- .rapl_msrs = intel_rapl_msrs,
+ .rapl_msrs = intel_rapl_spr_msrs,
};
static struct rapl_model model_amd_fam17h = {
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 89b1f74d3225..48e2c51464e8 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y := hv_init.o mmu.o nested.o
-obj-$(CONFIG_X86_64) += hv_apic.o
+obj-y := hv_init.o mmu.o nested.o irqdomain.o
+obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
ifdef CONFIG_X86_64
obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 6375967a8244..b81047dec1da 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -10,6 +10,7 @@
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/types.h>
+#include <linux/bitfield.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
@@ -26,8 +27,11 @@
#include <linux/cpuhotplug.h>
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
+#include <linux/highmem.h>
int hyperv_init_cpuhp;
+u64 hv_current_partition_id = ~0ull;
+EXPORT_SYMBOL_GPL(hv_current_partition_id);
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
@@ -44,6 +48,9 @@ EXPORT_SYMBOL_GPL(hv_vp_assist_page);
void __percpu **hyperv_pcpu_input_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
+void __percpu **hyperv_pcpu_output_arg;
+EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
+
u32 hv_max_vp_index;
EXPORT_SYMBOL_GPL(hv_max_vp_index);
@@ -76,12 +83,19 @@ static int hv_cpu_init(unsigned int cpu)
void **input_arg;
struct page *pg;
- input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
- pg = alloc_page(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL);
+ pg = alloc_pages(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL, hv_root_partition ? 1 : 0);
if (unlikely(!pg))
return -ENOMEM;
+
+ input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
*input_arg = page_address(pg);
+ if (hv_root_partition) {
+ void **output_arg;
+
+ output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
+ *output_arg = page_address(pg + 1);
+ }
hv_get_vp_index(msr_vp_index);
@@ -208,14 +222,23 @@ static int hv_cpu_die(unsigned int cpu)
unsigned int new_cpu;
unsigned long flags;
void **input_arg;
- void *input_pg = NULL;
+ void *pg;
local_irq_save(flags);
input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
- input_pg = *input_arg;
+ pg = *input_arg;
*input_arg = NULL;
+
+ if (hv_root_partition) {
+ void **output_arg;
+
+ output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
+ *output_arg = NULL;
+ }
+
local_irq_restore(flags);
- free_page((unsigned long)input_pg);
+
+ free_pages((unsigned long)pg, hv_root_partition ? 1 : 0);
if (hv_vp_assist_page && hv_vp_assist_page[cpu])
wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
@@ -264,6 +287,9 @@ static int hv_suspend(void)
union hv_x64_msr_hypercall_contents hypercall_msr;
int ret;
+ if (hv_root_partition)
+ return -EPERM;
+
/*
* Reset the hypercall page as it is going to be invalidated
* accross hibernation. Setting hv_hypercall_pg to NULL ensures
@@ -334,6 +360,24 @@ static void __init hv_stimer_setup_percpu_clockev(void)
old_setup_percpu_clockev();
}
+static void __init hv_get_partition_id(void)
+{
+ struct hv_get_partition_id *output_page;
+ u64 status;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
+ status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
+ if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+ /* No point in proceeding if this failed */
+ pr_err("Failed to get partition ID: %lld\n", status);
+ BUG();
+ }
+ hv_current_partition_id = output_page->partition_id;
+ local_irq_restore(flags);
+}
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -368,6 +412,12 @@ void __init hyperv_init(void)
BUG_ON(hyperv_pcpu_input_arg == NULL);
+ /* Allocate the per-CPU state for output arg for root */
+ if (hv_root_partition) {
+ hyperv_pcpu_output_arg = alloc_percpu(void *);
+ BUG_ON(hyperv_pcpu_output_arg == NULL);
+ }
+
/* Allocate percpu VP index */
hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
GFP_KERNEL);
@@ -408,8 +458,35 @@ void __init hyperv_init(void)
rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
hypercall_msr.enable = 1;
- hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
- wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ if (hv_root_partition) {
+ struct page *pg;
+ void *src, *dst;
+
+ /*
+ * For the root partition, the hypervisor will set up its
+ * hypercall page. The hypervisor guarantees it will not show
+ * up in the root's address space. The root can't change the
+ * location of the hypercall page.
+ *
+ * Order is important here. We must enable the hypercall page
+ * so it is populated with code, then copy the code to an
+ * executable page.
+ */
+ wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ pg = vmalloc_to_page(hv_hypercall_pg);
+ dst = kmap(pg);
+ src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE,
+ MEMREMAP_WB);
+ BUG_ON(!(src && dst));
+ memcpy(dst, src, HV_HYP_PAGE_SIZE);
+ memunmap(src);
+ kunmap(pg);
+ } else {
+ hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
+ wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ }
/*
* hyperv_init() is called before LAPIC is initialized: see
@@ -428,6 +505,21 @@ void __init hyperv_init(void)
register_syscore_ops(&hv_syscore_ops);
hyperv_init_cpuhp = cpuhp;
+
+ if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID)
+ hv_get_partition_id();
+
+ BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull);
+
+#ifdef CONFIG_PCI_MSI
+ /*
+ * If we're running as root, we want to create our own PCI MSI domain.
+ * We can't set this in hv_pci_init because that would be too late.
+ */
+ if (hv_root_partition)
+ x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
+#endif
+
return;
remove_cpuhp_state:
@@ -552,6 +644,20 @@ EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
bool hv_is_hibernation_supported(void)
{
- return acpi_sleep_state_supported(ACPI_STATE_S4);
+ return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
}
EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
+
+enum hv_isolation_type hv_get_isolation_type(void)
+{
+ if (!(ms_hyperv.features_b & HV_ISOLATION))
+ return HV_ISOLATION_TYPE_NONE;
+ return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
+}
+EXPORT_SYMBOL_GPL(hv_get_isolation_type);
+
+bool hv_is_isolation_supported(void)
+{
+ return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
+}
+EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
new file mode 100644
index 000000000000..60461e598239
--- /dev/null
+++ b/arch/x86/hyperv/hv_proc.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/clockchips.h>
+#include <linux/acpi.h>
+#include <linux/hyperv.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>
+#include <linux/minmax.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
+#include <asm/apic.h>
+
+#include <asm/trace/hyperv.h>
+
+/*
+ * See struct hv_deposit_memory. The first u64 is partition ID, the rest
+ * are GPAs.
+ */
+#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
+
+/* Deposits exact number of pages. Must be called with interrupts enabled. */
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
+{
+ struct page **pages, *page;
+ int *counts;
+ int num_allocations;
+ int i, j, page_count;
+ int order;
+ u64 status;
+ int ret;
+ u64 base_pfn;
+ struct hv_deposit_memory *input_page;
+ unsigned long flags;
+
+ if (num_pages > HV_DEPOSIT_MAX)
+ return -E2BIG;
+ if (!num_pages)
+ return 0;
+
+ /* One buffer for page pointers and counts */
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ pages = page_address(page);
+
+ counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL);
+ if (!counts) {
+ free_page((unsigned long)pages);
+ return -ENOMEM;
+ }
+
+ /* Allocate all the pages before disabling interrupts */
+ i = 0;
+
+ while (num_pages) {
+ /* Find highest order we can actually allocate */
+ order = 31 - __builtin_clz(num_pages);
+
+ while (1) {
+ pages[i] = alloc_pages_node(node, GFP_KERNEL, order);
+ if (pages[i])
+ break;
+ if (!order) {
+ ret = -ENOMEM;
+ num_allocations = i;
+ goto err_free_allocations;
+ }
+ --order;
+ }
+
+ split_page(pages[i], order);
+ counts[i] = 1 << order;
+ num_pages -= counts[i];
+ i++;
+ }
+ num_allocations = i;
+
+ local_irq_save(flags);
+
+ input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input_page->partition_id = partition_id;
+
+ /* Populate gpa_page_list - these will fit on the input page */
+ for (i = 0, page_count = 0; i < num_allocations; ++i) {
+ base_pfn = page_to_pfn(pages[i]);
+ for (j = 0; j < counts[i]; ++j, ++page_count)
+ input_page->gpa_page_list[page_count] = base_pfn + j;
+ }
+ status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
+ page_count, 0, input_page, NULL);
+ local_irq_restore(flags);
+
+ if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+ pr_err("Failed to deposit pages: %lld\n", status);
+ ret = status;
+ goto err_free_allocations;
+ }
+
+ ret = 0;
+ goto free_buf;
+
+err_free_allocations:
+ for (i = 0; i < num_allocations; ++i) {
+ base_pfn = page_to_pfn(pages[i]);
+ for (j = 0; j < counts[i]; ++j)
+ __free_page(pfn_to_page(base_pfn + j));
+ }
+
+free_buf:
+ free_page((unsigned long)pages);
+ kfree(counts);
+ return ret;
+}
+
+int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
+{
+ struct hv_add_logical_processor_in *input;
+ struct hv_add_logical_processor_out *output;
+ u64 status;
+ unsigned long flags;
+ int ret = 0;
+ int pxm = node_to_pxm(node);
+
+ /*
+ * When adding a logical processor, the hypervisor may return
+ * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more
+ * pages and retry.
+ */
+ do {
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ /* We don't do anything with the output right now */
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ input->lp_index = lp_index;
+ input->apic_id = apic_id;
+ input->flags = 0;
+ input->proximity_domain_info.domain_id = pxm;
+ input->proximity_domain_info.flags.reserved = 0;
+ input->proximity_domain_info.flags.proximity_info_valid = 1;
+ input->proximity_domain_info.flags.proximity_preferred = 1;
+ status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR,
+ input, output);
+ local_irq_restore(flags);
+
+ status &= HV_HYPERCALL_RESULT_MASK;
+
+ if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (status != HV_STATUS_SUCCESS) {
+ pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
+ lp_index, apic_id, status);
+ ret = status;
+ }
+ break;
+ }
+ ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
+ } while (!ret);
+
+ return ret;
+}
+
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
+{
+ struct hv_create_vp *input;
+ u64 status;
+ unsigned long irq_flags;
+ int ret = 0;
+ int pxm = node_to_pxm(node);
+
+ /* Root VPs don't seem to need pages deposited */
+ if (partition_id != hv_current_partition_id) {
+ /* The value 90 is empirically determined. It may change. */
+ ret = hv_call_deposit_pages(node, partition_id, 90);
+ if (ret)
+ return ret;
+ }
+
+ do {
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ input->partition_id = partition_id;
+ input->vp_index = vp_index;
+ input->flags = flags;
+ input->subnode_type = HvSubnodeAny;
+ if (node != NUMA_NO_NODE) {
+ input->proximity_domain_info.domain_id = pxm;
+ input->proximity_domain_info.flags.reserved = 0;
+ input->proximity_domain_info.flags.proximity_info_valid = 1;
+ input->proximity_domain_info.flags.proximity_preferred = 1;
+ } else {
+ input->proximity_domain_info.as_uint64 = 0;
+ }
+ status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
+ local_irq_restore(irq_flags);
+
+ status &= HV_HYPERCALL_RESULT_MASK;
+
+ if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (status != HV_STATUS_SUCCESS) {
+ pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
+ vp_index, flags, status);
+ ret = status;
+ }
+ break;
+ }
+ ret = hv_call_deposit_pages(node, partition_id, 1);
+
+ } while (!ret);
+
+ return ret;
+}
+
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
new file mode 100644
index 000000000000..4421a8d92e23
--- /dev/null
+++ b/arch/x86/hyperv/irqdomain.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor.
+ *
+ * Authors:
+ * Sunil Muthuswamy <sunilmut@microsoft.com>
+ * Wei Liu <wei.liu@kernel.org>
+ */
+
+#include <linux/pci.h>
+#include <linux/irq.h>
+#include <asm/mshyperv.h>
+
+static int hv_map_interrupt(union hv_device_id device_id, bool level,
+ int cpu, int vector, struct hv_interrupt_entry *entry)
+{
+ struct hv_input_map_device_interrupt *input;
+ struct hv_output_map_device_interrupt *output;
+ struct hv_device_interrupt_descriptor *intr_desc;
+ unsigned long flags;
+ u64 status;
+ int nr_bank, var_size;
+
+ local_irq_save(flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ intr_desc = &input->interrupt_descriptor;
+ memset(input, 0, sizeof(*input));
+ input->partition_id = hv_current_partition_id;
+ input->device_id = device_id.as_uint64;
+ intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
+ intr_desc->vector_count = 1;
+ intr_desc->target.vector = vector;
+
+ if (level)
+ intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL;
+ else
+ intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;
+
+ intr_desc->target.vp_set.valid_bank_mask = 0;
+ intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu));
+ if (nr_bank < 0) {
+ local_irq_restore(flags);
+ pr_err("%s: unable to generate VP set\n", __func__);
+ return EINVAL;
+ }
+ intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+
+ /*
+ * var-sized hypercall, var-size starts after vp_mask (thus
+ * vp_set.format does not count, but vp_set.valid_bank_mask
+ * does).
+ */
+ var_size = nr_bank + 1;
+
+ status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
+ input, output);
+ *entry = output->interrupt_entry;
+
+ local_irq_restore(flags);
+
+ if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
+ pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+
+ return status & HV_HYPERCALL_RESULT_MASK;
+}
+
+static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
+{
+ unsigned long flags;
+ struct hv_input_unmap_device_interrupt *input;
+ struct hv_interrupt_entry *intr_entry;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ memset(input, 0, sizeof(*input));
+ intr_entry = &input->interrupt_entry;
+ input->partition_id = hv_current_partition_id;
+ input->device_id = id;
+ *intr_entry = *old_entry;
+
+ status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
+ local_irq_restore(flags);
+
+ return status & HV_HYPERCALL_RESULT_MASK;
+}
+
+#ifdef CONFIG_PCI_MSI
+struct rid_data {
+ struct pci_dev *bridge;
+ u32 rid;
+};
+
+static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
+{
+ struct rid_data *rd = data;
+ u8 bus = PCI_BUS_NUM(rd->rid);
+
+ if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) {
+ rd->bridge = pdev;
+ rd->rid = alias;
+ }
+
+ return 0;
+}
+
+static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
+{
+ union hv_device_id dev_id;
+ struct rid_data data = {
+ .bridge = NULL,
+ .rid = PCI_DEVID(dev->bus->number, dev->devfn)
+ };
+
+ pci_for_each_dma_alias(dev, get_rid_cb, &data);
+
+ dev_id.as_uint64 = 0;
+ dev_id.device_type = HV_DEVICE_TYPE_PCI;
+ dev_id.pci.segment = pci_domain_nr(dev->bus);
+
+ dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid);
+ dev_id.pci.bdf.device = PCI_SLOT(data.rid);
+ dev_id.pci.bdf.function = PCI_FUNC(data.rid);
+ dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
+
+ if (data.bridge) {
+ int pos;
+
+ /*
+ * Microsoft Hypervisor requires a bus range when the bridge is
+ * running in PCI-X mode.
+ *
+ * To distinguish conventional vs PCI-X bridge, we can check
+ * the bridge's PCI-X Secondary Status Register, Secondary Bus
+ * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
+ * Specification Revision 1.0 5.2.2.1.3.
+ *
+ * Value zero means it is in conventional mode, otherwise it is
+ * in PCI-X mode.
+ */
+
+ pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
+ if (pos) {
+ u16 status;
+
+ pci_read_config_word(data.bridge, pos +
+ PCI_X_BRIDGE_SSTATUS, &status);
+
+ if (status & PCI_X_SSTATUS_FREQ) {
+ /* Non-zero, PCI-X mode */
+ u8 sec_bus, sub_bus;
+
+ dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
+
+ pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus);
+ dev_id.pci.shadow_bus_range.secondary_bus = sec_bus;
+ pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus);
+ dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus;
+ }
+ }
+ }
+
+ return dev_id;
+}
+
+static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector,
+ struct hv_interrupt_entry *entry)
+{
+ union hv_device_id device_id = hv_build_pci_dev_id(dev);
+
+ return hv_map_interrupt(device_id, false, cpu, vector, entry);
+}
+
+static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
+{
+ /* High address is always 0 */
+ msg->address_hi = 0;
+ msg->address_lo = entry->msi_entry.address.as_uint32;
+ msg->data = entry->msi_entry.data.as_uint32;
+}
+
+static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
+static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct msi_desc *msidesc;
+ struct pci_dev *dev;
+ struct hv_interrupt_entry out_entry, *stored_entry;
+ struct irq_cfg *cfg = irqd_cfg(data);
+ cpumask_t *affinity;
+ int cpu;
+ u64 status;
+
+ msidesc = irq_data_get_msi_desc(data);
+ dev = msi_desc_to_pci_dev(msidesc);
+
+ if (!cfg) {
+ pr_debug("%s: cfg is NULL", __func__);
+ return;
+ }
+
+ affinity = irq_data_get_effective_affinity_mask(data);
+ cpu = cpumask_first_and(affinity, cpu_online_mask);
+
+ if (data->chip_data) {
+ /*
+ * This interrupt is already mapped. Let's unmap first.
+ *
+ * We don't use retarget interrupt hypercalls here because
+ * Microsoft Hypervisor doens't allow root to change the vector
+ * or specify VPs outside of the set that is initially used
+ * during mapping.
+ */
+ stored_entry = data->chip_data;
+ data->chip_data = NULL;
+
+ status = hv_unmap_msi_interrupt(dev, stored_entry);
+
+ kfree(stored_entry);
+
+ if (status != HV_STATUS_SUCCESS) {
+ pr_debug("%s: failed to unmap, status %lld", __func__, status);
+ return;
+ }
+ }
+
+ stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC);
+ if (!stored_entry) {
+ pr_debug("%s: failed to allocate chip data\n", __func__);
+ return;
+ }
+
+ status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry);
+ if (status != HV_STATUS_SUCCESS) {
+ kfree(stored_entry);
+ return;
+ }
+
+ *stored_entry = out_entry;
+ data->chip_data = stored_entry;
+ entry_to_msi_msg(&out_entry, msg);
+
+ return;
+}
+
+static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry)
+{
+ return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
+}
+
+static void hv_teardown_msi_irq_common(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+{
+ u64 status;
+ struct hv_interrupt_entry old_entry;
+ struct irq_desc *desc;
+ struct irq_data *data;
+ struct msi_msg msg;
+
+ desc = irq_to_desc(irq);
+ if (!desc) {
+ pr_debug("%s: no irq desc\n", __func__);
+ return;
+ }
+
+ data = &desc->irq_data;
+ if (!data) {
+ pr_debug("%s: no irq data\n", __func__);
+ return;
+ }
+
+ if (!data->chip_data) {
+ pr_debug("%s: no chip data\n!", __func__);
+ return;
+ }
+
+ old_entry = *(struct hv_interrupt_entry *)data->chip_data;
+ entry_to_msi_msg(&old_entry, &msg);
+
+ kfree(data->chip_data);
+ data->chip_data = NULL;
+
+ status = hv_unmap_msi_interrupt(dev, &old_entry);
+
+ if (status != HV_STATUS_SUCCESS) {
+ pr_err("%s: hypercall failed, status %lld\n", __func__, status);
+ return;
+ }
+}
+
+static void hv_msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ int i;
+ struct msi_desc *entry;
+ struct pci_dev *pdev;
+
+ if (WARN_ON_ONCE(!dev_is_pci(dev)))
+ return;
+
+ pdev = to_pci_dev(dev);
+
+ for_each_pci_msi_entry(entry, pdev) {
+ if (entry->irq) {
+ for (i = 0; i < entry->nvec_used; i++) {
+ hv_teardown_msi_irq_common(pdev, entry, entry->irq + i);
+ irq_domain_free_irqs(entry->irq + i, 1);
+ }
+ }
+ }
+}
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip hv_pci_msi_controller = {
+ .name = "HV-PCI-MSI",
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = hv_irq_compose_msi_msg,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct msi_domain_ops pci_msi_domain_ops = {
+ .domain_free_irqs = hv_msi_domain_free_irqs,
+ .msi_prepare = pci_msi_prepare,
+};
+
+static struct msi_domain_info hv_pci_msi_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_PCI_MSIX,
+ .ops = &pci_msi_domain_ops,
+ .chip = &hv_pci_msi_controller,
+ .handler = handle_edge_irq,
+ .handler_name = "edge",
+};
+
+struct irq_domain * __init hv_create_pci_msi_domain(void)
+{
+ struct irq_domain *d = NULL;
+ struct fwnode_handle *fn;
+
+ fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI");
+ if (fn)
+ d = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info, x86_vector_domain);
+
+ /* No point in going further if we can't get an irq domain */
+ BUG_ON(!d);
+
+ return d;
+}
+
+#endif /* CONFIG_PCI_MSI */
+
+int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
+{
+ union hv_device_id device_id;
+
+ device_id.as_uint64 = 0;
+ device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
+ device_id.ioapic.ioapic_id = (u8)ioapic_id;
+
+ return hv_unmap_interrupt(device_id.as_uint64, entry);
+}
+EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);
+
+int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector,
+ struct hv_interrupt_entry *entry)
+{
+ union hv_device_id device_id;
+
+ device_id.as_uint64 = 0;
+ device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
+ device_id.ioapic.ioapic_id = (u8)ioapic_id;
+
+ return hv_map_interrupt(device_id, level, cpu, vector, entry);
+}
+EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt);
diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h
new file mode 100644
index 000000000000..e003a01b7c67
--- /dev/null
+++ b/arch/x86/include/asm/acrn.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_ACRN_H
+#define _ASM_X86_ACRN_H
+
+/*
+ * This CPUID returns feature bitmaps in EAX.
+ * Guest VM uses this to detect the appropriate feature bit.
+ */
+#define ACRN_CPUID_FEATURES 0x40000001
+/* Bit 0 indicates whether guest VM is privileged */
+#define ACRN_FEATURE_PRIVILEGED_VM BIT(0)
+
+void acrn_setup_intr_handler(void (*handler)(void));
+void acrn_remove_intr_handler(void);
+
+static inline u32 acrn_cpuid_base(void)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return hypervisor_cpuid_base("ACRNACRNACRN", 0);
+
+ return 0;
+}
+
+/*
+ * Hypercalls for ACRN
+ *
+ * - VMCALL instruction is used to implement ACRN hypercalls.
+ * - ACRN hypercall ABI:
+ * - Hypercall number is passed in R8 register.
+ * - Up to 2 arguments are passed in RDI, RSI.
+ * - Return value will be placed in RAX.
+ *
+ * Because GCC doesn't support R8 register as direct register constraints, use
+ * supported constraint as input with a explicit MOV to R8 in beginning of asm.
+ */
+static inline long acrn_hypercall0(unsigned long hcall_id)
+{
+ long result;
+
+ asm volatile("movl %1, %%r8d\n\t"
+ "vmcall\n\t"
+ : "=a" (result)
+ : "g" (hcall_id)
+ : "r8", "memory");
+
+ return result;
+}
+
+static inline long acrn_hypercall1(unsigned long hcall_id,
+ unsigned long param1)
+{
+ long result;
+
+ asm volatile("movl %1, %%r8d\n\t"
+ "vmcall\n\t"
+ : "=a" (result)
+ : "g" (hcall_id), "D" (param1)
+ : "r8", "memory");
+
+ return result;
+}
+
+static inline long acrn_hypercall2(unsigned long hcall_id,
+ unsigned long param1,
+ unsigned long param2)
+{
+ long result;
+
+ asm volatile("movl %1, %%r8d\n\t"
+ "vmcall\n\t"
+ : "=a" (result)
+ : "g" (hcall_id), "D" (param1), "S" (param2)
+ : "r8", "memory");
+
+ return result;
+}
+
+#endif /* _ASM_X86_ACRN_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
deleted file mode 100644
index 87ce8e963215..000000000000
--- a/arch/x86/include/asm/apb_timer.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare
- *
- * (C) Copyright 2009 Intel Corporation
- * Author: Jacob Pan (jacob.jun.pan@intel.com)
- *
- * Note:
- */
-
-#ifndef ASM_X86_APBT_H
-#define ASM_X86_APBT_H
-#include <linux/sfi.h>
-
-#ifdef CONFIG_APB_TIMER
-
-/* default memory mapped register base */
-#define LNW_SCU_ADDR 0xFF100000
-#define LNW_EXT_TIMER_OFFSET 0x1B800
-#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET)
-#define LNW_EXT_TIMER_PGOFFSET 0x800
-
-/* APBT clock speed range from PCLK to fabric base, 25-100MHz */
-#define APBT_MAX_FREQ 50000000
-#define APBT_MIN_FREQ 1000000
-#define APBT_MMAP_SIZE 1024
-
-extern void apbt_time_init(void);
-extern void apbt_setup_secondary_clock(void);
-
-extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
-extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
-extern int sfi_mtimer_num;
-
-#else /* CONFIG_APB_TIMER */
-
-static inline void apbt_time_init(void) { }
-
-#endif
-#endif /* ASM_X86_APBT_H */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index f145e3326c6d..be09c7eac89f 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -159,17 +159,6 @@ struct compat_shmid64_ds {
compat_ulong_t __unused5;
};
-/*
- * The type of struct elf_prstatus.pr_reg in compatible core dumps.
- */
-typedef struct user_regs_struct compat_elf_gregset_t;
-
-/* Full regset -- prstatus on x32, otherwise on ia32 */
-#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296)
-#define SET_PR_FPVALID(S, V, R) \
- do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \
- while (0)
-
#ifdef CONFIG_X86_X32_ABI
#define COMPAT_USE_64BIT_TIME \
(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 59bf91c57aa8..1728d4ce5730 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -30,6 +30,7 @@ enum cpuid_leafs
CPUID_7_ECX,
CPUID_8000_0007_EBX,
CPUID_7_EDX,
+ CPUID_8000_001F_EAX,
};
#ifdef CONFIG_X86_FEATURE_NAMES
@@ -88,8 +89,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
REQUIRED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 20))
#define DISABLED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
@@ -111,8 +113,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
DISABLED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 19))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 20))
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 84b887825f12..cc96e26d69f7 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 19 /* N 32-bit words worth of info */
+#define NCAPINTS 20 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -96,7 +96,7 @@
#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
-#define X86_FEATURE_SME_COHERENT ( 3*32+17) /* "" AMD hardware-enforced cache coherency */
+/* FREE! ( 3*32+17) */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
@@ -201,7 +201,7 @@
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
+/* FREE! ( 7*32+10) */
#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
@@ -211,7 +211,7 @@
#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */
#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
-#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */
+/* FREE! ( 7*32+20) */
#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
#define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */
#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */
@@ -236,8 +236,6 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
-#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
-#define X86_FEATURE_VM_PAGE_FLUSH ( 8*32+21) /* "" VM Page Flush MSR is supported */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@@ -294,6 +292,7 @@
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
@@ -337,6 +336,7 @@
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
@@ -385,6 +385,13 @@
#define X86_FEATURE_CORE_CAPABILITIES (18*32+30) /* "" IA32_CORE_CAPABILITIES MSR */
#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
+/* AMD-defined memory encryption features, CPUID level 0x8000001f (EAX), word 19 */
+#define X86_FEATURE_SME (19*32+ 0) /* AMD Secure Memory Encryption */
+#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */
+#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */
+#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
+
/*
* BUG word(s)
*/
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
deleted file mode 100644
index 777c0f63418c..000000000000
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Shared glue code for 128bit block ciphers
- */
-
-#ifndef _CRYPTO_GLUE_HELPER_H
-#define _CRYPTO_GLUE_HELPER_H
-
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <asm/fpu/api.h>
-#include <crypto/b128ops.h>
-
-typedef void (*common_glue_func_t)(const void *ctx, u8 *dst, const u8 *src);
-typedef void (*common_glue_cbc_func_t)(const void *ctx, u8 *dst, const u8 *src);
-typedef void (*common_glue_ctr_func_t)(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-typedef void (*common_glue_xts_func_t)(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-struct common_glue_func_entry {
- unsigned int num_blocks; /* number of blocks that @fn will process */
- union {
- common_glue_func_t ecb;
- common_glue_cbc_func_t cbc;
- common_glue_ctr_func_t ctr;
- common_glue_xts_func_t xts;
- } fn_u;
-};
-
-struct common_glue_ctx {
- unsigned int num_funcs;
- int fpu_blocks_limit; /* -1 means fpu not needed at all */
-
- /*
- * First funcs entry must have largest num_blocks and last funcs entry
- * must have num_blocks == 1!
- */
- struct common_glue_func_entry funcs[];
-};
-
-static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit,
- struct skcipher_walk *walk,
- bool fpu_enabled, unsigned int nbytes)
-{
- if (likely(fpu_blocks_limit < 0))
- return false;
-
- if (fpu_enabled)
- return true;
-
- /*
- * Vector-registers are only used when chunk to be processed is large
- * enough, so do not enable FPU until it is necessary.
- */
- if (nbytes < bsize * (unsigned int)fpu_blocks_limit)
- return false;
-
- /* prevent sleeping if FPU is in use */
- skcipher_walk_atomise(walk);
-
- kernel_fpu_begin();
- return true;
-}
-
-static inline void glue_fpu_end(bool fpu_enabled)
-{
- if (fpu_enabled)
- kernel_fpu_end();
-}
-
-static inline void le128_to_be128(be128 *dst, const le128 *src)
-{
- dst->a = cpu_to_be64(le64_to_cpu(src->a));
- dst->b = cpu_to_be64(le64_to_cpu(src->b));
-}
-
-static inline void be128_to_le128(le128 *dst, const be128 *src)
-{
- dst->a = cpu_to_le64(be64_to_cpu(src->a));
- dst->b = cpu_to_le64(be64_to_cpu(src->b));
-}
-
-static inline void le128_inc(le128 *i)
-{
- u64 a = le64_to_cpu(i->a);
- u64 b = le64_to_cpu(i->b);
-
- b++;
- if (!b)
- a++;
-
- i->a = cpu_to_le64(a);
- i->b = cpu_to_le64(b);
-}
-
-extern int glue_ecb_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req);
-
-extern int glue_cbc_encrypt_req_128bit(const common_glue_func_t fn,
- struct skcipher_request *req);
-
-extern int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req);
-
-extern int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req);
-
-extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
- struct skcipher_request *req,
- common_glue_func_t tweak_fn, void *tweak_ctx,
- void *crypt_ctx, bool decrypt);
-
-extern void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst,
- const u8 *src, le128 *iv,
- common_glue_func_t fn);
-
-#endif /* _CRYPTO_GLUE_HELPER_H */
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
deleted file mode 100644
index 251c2c89d7cf..000000000000
--- a/arch/x86/include/asm/crypto/serpent-avx.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ASM_X86_SERPENT_AVX_H
-#define ASM_X86_SERPENT_AVX_H
-
-#include <crypto/b128ops.h>
-#include <crypto/serpent.h>
-#include <linux/types.h>
-
-struct crypto_skcipher;
-
-#define SERPENT_PARALLEL_BLOCKS 8
-
-struct serpent_xts_ctx {
- struct serpent_ctx tweak_ctx;
- struct serpent_ctx crypt_ctx;
-};
-
-asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst,
- const u8 *src);
-
-asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-
-extern void __serpent_crypt_ctr(const void *ctx, u8 *dst, const u8 *src,
- le128 *iv);
-
-extern void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv);
-extern void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv);
-
-extern int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen);
-
-#endif
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 7947cb1782da..b7dd944dc867 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -91,6 +91,7 @@
DISABLE_ENQCMD)
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define DISABLED_MASK19 0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index c98f78330b09..4d0b126835b8 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -12,6 +12,7 @@
#include <linux/pgtable.h>
extern unsigned long efi_fw_vendor, efi_config_table;
+extern unsigned long efi_mixed_mode_stack_pa;
/*
* We map the EFI regions needed for runtime services non-contiguously,
@@ -68,17 +69,33 @@ extern unsigned long efi_fw_vendor, efi_config_table;
#f " called with too many arguments (" #p ">" #n ")"); \
})
+static inline void efi_fpu_begin(void)
+{
+ /*
+ * The UEFI calling convention (UEFI spec 2.3.2 and 2.3.4) requires
+ * that FCW and MXCSR (64-bit) must be initialized prior to calling
+ * UEFI code. (Oddly the spec does not require that the FPU stack
+ * be empty.)
+ */
+ kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+}
+
+static inline void efi_fpu_end(void)
+{
+ kernel_fpu_end();
+}
+
#ifdef CONFIG_X86_32
#define arch_efi_call_virt_setup() \
({ \
- kernel_fpu_begin(); \
+ efi_fpu_begin(); \
firmware_restrict_branch_speculation_start(); \
})
#define arch_efi_call_virt_teardown() \
({ \
firmware_restrict_branch_speculation_end(); \
- kernel_fpu_end(); \
+ efi_fpu_end(); \
})
#define arch_efi_call_virt(p, f, args...) p->f(args)
@@ -94,22 +111,12 @@ extern asmlinkage u64 __efi_call(void *fp, ...);
__efi_call(__VA_ARGS__); \
})
-/*
- * struct efi_scratch - Scratch space used while switching to/from efi_mm
- * @phys_stack: stack used during EFI Mixed Mode
- * @prev_mm: store/restore stolen mm_struct while switching to/from efi_mm
- */
-struct efi_scratch {
- u64 phys_stack;
- struct mm_struct *prev_mm;
-} __packed;
-
#define arch_efi_call_virt_setup() \
({ \
efi_sync_low_kernel_mappings(); \
- kernel_fpu_begin(); \
+ efi_fpu_begin(); \
firmware_restrict_branch_speculation_start(); \
- efi_switch_mm(&efi_mm); \
+ efi_enter_mm(); \
})
#define arch_efi_call_virt(p, f, args...) \
@@ -117,9 +124,9 @@ struct efi_scratch {
#define arch_efi_call_virt_teardown() \
({ \
- efi_switch_mm(efi_scratch.prev_mm); \
+ efi_leave_mm(); \
firmware_restrict_branch_speculation_end(); \
- kernel_fpu_end(); \
+ efi_fpu_end(); \
})
#ifdef CONFIG_KASAN
@@ -136,7 +143,6 @@ struct efi_scratch {
#endif /* CONFIG_X86_32 */
-extern struct efi_scratch efi_scratch;
extern int __init efi_memblock_x86_reserve_range(void);
extern void __init efi_print_memmap(void);
extern void __init efi_map_region(efi_memory_desc_t *md);
@@ -149,10 +155,12 @@ extern void __init efi_dump_pagetable(void);
extern void __init efi_apply_memmap_quirks(void);
extern int __init efi_reuse_config(u64 tables, int nr_tables);
extern void efi_delete_dummy_variable(void);
-extern void efi_switch_mm(struct mm_struct *mm);
-extern void efi_recover_from_page_fault(unsigned long phys_addr);
+extern void efi_crash_gracefully_on_page_fault(unsigned long phys_addr);
extern void efi_free_boot_services(void);
+void efi_enter_mm(void);
+void efi_leave_mm(void);
+
/* kexec external ABI */
struct efi_setup_data {
u64 fw_vendor;
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 66bdfe838d61..9224d40cdefe 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -364,7 +364,7 @@ do { \
#define COMPAT_ARCH_DLINFO \
if (exec->e_machine == EM_X86_64) \
ARCH_DLINFO_X32; \
-else \
+else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \
ARCH_DLINFO_IA32
#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
diff --git a/arch/x86/include/asm/elfcore-compat.h b/arch/x86/include/asm/elfcore-compat.h
new file mode 100644
index 000000000000..f1b6c7a8d8fc
--- /dev/null
+++ b/arch/x86/include/asm/elfcore-compat.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_ELFCORE_COMPAT_H
+#define _ASM_X86_ELFCORE_COMPAT_H
+
+#include <asm/user32.h>
+
+/*
+ * On amd64 we have two 32bit ABIs - i386 and x32. The latter
+ * has bigger registers, so we use it for compat_elf_regset_t.
+ * The former uses i386_elf_prstatus and PRSTATUS_SIZE/SET_PR_FPVALID
+ * are used to choose the size and location of ->pr_fpvalid of
+ * the layout actually used.
+ */
+typedef struct user_regs_struct compat_elf_gregset_t;
+
+struct i386_elf_prstatus
+{
+ struct compat_elf_prstatus_common common;
+ struct user_regs_struct32 pr_reg;
+ compat_int_t pr_fpvalid;
+};
+
+#define PRSTATUS_SIZE \
+ (user_64bit_mode(task_pt_regs(current)) \
+ ? sizeof(struct compat_elf_prstatus) \
+ : sizeof(struct i386_elf_prstatus))
+#define SET_PR_FPVALID(S) \
+ (*(user_64bit_mode(task_pt_regs(current)) \
+ ? &(S)->pr_fpvalid \
+ : &((struct i386_elf_prstatus *)(S))->pr_fpvalid) = 1)
+
+#endif
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 9f1a0a987e5e..d0dcefb5cc59 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -108,9 +108,6 @@ enum fixed_addresses {
#ifdef CONFIG_PARAVIRT_XXL
FIX_PARAVIRT_BOOTMAP,
#endif
-#ifdef CONFIG_X86_INTEL_MID
- FIX_LNW_VRTC,
-#endif
#ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 67a4f1cb2aac..ed33a14188f6 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -32,7 +32,19 @@ extern void fpregs_mark_activate(void);
/* Code that is unaware of kernel_fpu_begin_mask() can use this */
static inline void kernel_fpu_begin(void)
{
+#ifdef CONFIG_X86_64
+ /*
+ * Any 64-bit code that uses 387 instructions must explicitly request
+ * KFPU_387.
+ */
+ kernel_fpu_begin_mask(KFPU_MXCSR);
+#else
+ /*
+ * 32-bit kernel code may use 387 operations as well as SSE2, etc,
+ * as long as it checks that the CPU has the required capability.
+ */
kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR);
+#endif
}
/*
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 6bf42aed387e..e6cd3fee562b 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -21,7 +21,9 @@
#define HYPERV_CPUID_FEATURES 0x40000003
#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
+#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007
#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A
+#define HYPERV_CPUID_ISOLATION_CONFIG 0x4000000C
#define HYPERV_CPUID_VIRT_STACK_INTERFACE 0x40000081
#define HYPERV_VS_INTERFACE_EAX_SIGNATURE 0x31235356 /* "VS#1" */
@@ -111,6 +113,15 @@
#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14)
/*
+ * CPU management features identification.
+ * These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits.
+ */
+#define HV_X64_START_LOGICAL_PROCESSOR BIT(0)
+#define HV_X64_CREATE_ROOT_VIRTUAL_PROCESSOR BIT(1)
+#define HV_X64_PERFORMANCE_COUNTER_SYNC BIT(2)
+#define HV_X64_RESERVED_IDENTITY_BIT BIT(31)
+
+/*
* Virtual processor will never share a physical core with another virtual
* processor, except for virtual processors that are reported as sibling SMT
* threads.
@@ -122,6 +133,20 @@
#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
#define HV_X64_NESTED_MSR_BITMAP BIT(19)
+/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */
+#define HV_PARAVISOR_PRESENT BIT(0)
+
+/* HYPERV_CPUID_ISOLATION_CONFIG.EBX bits. */
+#define HV_ISOLATION_TYPE GENMASK(3, 0)
+#define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5)
+#define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6)
+
+enum hv_isolation_type {
+ HV_ISOLATION_TYPE_NONE = 0,
+ HV_ISOLATION_TYPE_VBS = 1,
+ HV_ISOLATION_TYPE_SNP = 2
+};
+
/* Hyper-V specific model specific registers (MSRs) */
/* MSR used to identify the guest OS. */
@@ -523,6 +548,19 @@ struct hv_partition_assist_pg {
u32 tlb_lock_count;
};
+enum hv_interrupt_type {
+ HV_X64_INTERRUPT_TYPE_FIXED = 0x0000,
+ HV_X64_INTERRUPT_TYPE_LOWESTPRIORITY = 0x0001,
+ HV_X64_INTERRUPT_TYPE_SMI = 0x0002,
+ HV_X64_INTERRUPT_TYPE_REMOTEREAD = 0x0003,
+ HV_X64_INTERRUPT_TYPE_NMI = 0x0004,
+ HV_X64_INTERRUPT_TYPE_INIT = 0x0005,
+ HV_X64_INTERRUPT_TYPE_SIPI = 0x0006,
+ HV_X64_INTERRUPT_TYPE_EXTINT = 0x0007,
+ HV_X64_INTERRUPT_TYPE_LOCALINT0 = 0x0008,
+ HV_X64_INTERRUPT_TYPE_LOCALINT1 = 0x0009,
+ HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A,
+};
#include <asm-generic/hyperv-tlfs.h>
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index f656aabd1545..5eb3bdf36a41 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -187,23 +187,22 @@ __visible noinstr void func(struct pt_regs *regs, unsigned long error_code)
* has to be done in the function body if necessary.
*/
#define DEFINE_IDTENTRY_IRQ(func) \
-static __always_inline void __##func(struct pt_regs *regs, u8 vector); \
+static void __##func(struct pt_regs *regs, u32 vector); \
\
__visible noinstr void func(struct pt_regs *regs, \
unsigned long error_code) \
{ \
irqentry_state_t state = irqentry_enter(regs); \
+ u32 vector = (u32)(u8)error_code; \
\
instrumentation_begin(); \
- irq_enter_rcu(); \
kvm_set_cpu_l1tf_flush_l1d(); \
- __##func (regs, (u8)error_code); \
- irq_exit_rcu(); \
+ run_irq_on_irqstack_cond(__##func, regs, vector); \
instrumentation_end(); \
irqentry_exit(regs, state); \
} \
\
-static __always_inline void __##func(struct pt_regs *regs, u8 vector)
+static noinline void __##func(struct pt_regs *regs, u32 vector)
/**
* DECLARE_IDTENTRY_SYSVEC - Declare functions for system vector entry points
@@ -237,10 +236,8 @@ __visible noinstr void func(struct pt_regs *regs) \
irqentry_state_t state = irqentry_enter(regs); \
\
instrumentation_begin(); \
- irq_enter_rcu(); \
kvm_set_cpu_l1tf_flush_l1d(); \
run_sysvec_on_irqstack_cond(__##func, regs); \
- irq_exit_rcu(); \
instrumentation_end(); \
irqentry_exit(regs, state); \
} \
@@ -585,6 +582,9 @@ DECLARE_IDTENTRY_MCE(X86_TRAP_MC, exc_machine_check);
#else
DECLARE_IDTENTRY_RAW(X86_TRAP_MC, exc_machine_check);
#endif
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
+#endif
#endif
/* NMI */
@@ -605,6 +605,9 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug);
/* #DF */
DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
+#ifdef CONFIG_XEN_PV
+DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
+#endif
/* #VC */
#ifdef CONFIG_AMD_MEM_ENCRYPT
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index a0f839aa144d..98b4dae5e8bc 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -23,6 +23,8 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
int insn_get_code_seg_params(struct pt_regs *regs);
int insn_fetch_from_user(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
+int insn_fetch_from_user_inatomic(struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE]);
bool insn_decode(struct insn *insn, struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE], int buf_size);
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index a8c3d284fa46..95a448fbb44c 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -7,9 +7,12 @@
* Copyright (C) IBM Corporation, 2009
*/
+#include <asm/byteorder.h>
/* insn_attr_t is defined in inat.h */
#include <asm/inat.h>
+#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
+
struct insn_field {
union {
insn_value_t value;
@@ -20,6 +23,48 @@ struct insn_field {
unsigned char nbytes;
};
+static inline void insn_field_set(struct insn_field *p, insn_value_t v,
+ unsigned char n)
+{
+ p->value = v;
+ p->nbytes = n;
+}
+
+static inline void insn_set_byte(struct insn_field *p, unsigned char n,
+ insn_byte_t v)
+{
+ p->bytes[n] = v;
+}
+
+#else
+
+struct insn_field {
+ insn_value_t value;
+ union {
+ insn_value_t little;
+ insn_byte_t bytes[4];
+ };
+ /* !0 if we've run insn_get_xxx() for this field */
+ unsigned char got;
+ unsigned char nbytes;
+};
+
+static inline void insn_field_set(struct insn_field *p, insn_value_t v,
+ unsigned char n)
+{
+ p->value = v;
+ p->little = __cpu_to_le32(v);
+ p->nbytes = n;
+}
+
+static inline void insn_set_byte(struct insn_field *p, unsigned char n,
+ insn_byte_t v)
+{
+ p->bytes[n] = v;
+ p->value = __le32_to_cpu(p->little);
+}
+#endif
+
struct insn {
struct insn_field prefixes; /*
* Prefixes
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index cf0e25f45422..c201083b34f6 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -1,15 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * intel-mid.h: Intel MID specific setup code
+ * Intel MID specific setup code
*
- * (C) Copyright 2009 Intel Corporation
+ * (C) Copyright 2009, 2021 Intel Corporation
*/
#ifndef _ASM_X86_INTEL_MID_H
#define _ASM_X86_INTEL_MID_H
-#include <linux/sfi.h>
#include <linux/pci.h>
-#include <linux/platform_device.h>
extern int intel_mid_pci_init(void);
extern int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state);
@@ -22,93 +20,18 @@ extern void intel_mid_pwr_power_off(void);
extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev);
-extern int get_gpio_by_name(const char *name);
-extern int __init sfi_parse_mrtc(struct sfi_table_header *table);
-extern int __init sfi_parse_mtmr(struct sfi_table_header *table);
-extern int sfi_mrtc_num;
-extern struct sfi_rtc_table_entry sfi_mrtc_array[];
-
-/*
- * Here defines the array of devices platform data that IAFW would export
- * through SFI "DEVS" table, we use name and type to match the device and
- * its platform data.
- */
-struct devs_id {
- char name[SFI_NAME_LEN + 1];
- u8 type;
- u8 delay;
- u8 msic;
- void *(*get_platform_data)(void *info);
-};
-
-#define sfi_device(i) \
- static const struct devs_id *const __intel_mid_sfi_##i##_dev __used \
- __section(".x86_intel_mid_dev.init") = &i
-
-/**
-* struct mid_sd_board_info - template for SD device creation
-* @name: identifies the driver
-* @bus_num: board-specific identifier for a given SD controller
-* @max_clk: the maximum frequency device supports
-* @platform_data: the particular data stored there is driver-specific
-*/
-struct mid_sd_board_info {
- char name[SFI_NAME_LEN];
- int bus_num;
- unsigned short addr;
- u32 max_clk;
- void *platform_data;
-};
-
-/*
- * Medfield is the follow-up of Moorestown, it combines two chip solution into
- * one. Other than that it also added always-on and constant tsc and lapic
- * timers. Medfield is the platform name, and the chip name is called Penwell
- * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be
- * identified via MSRs.
- */
-enum intel_mid_cpu_type {
- /* 1 was Moorestown */
- INTEL_MID_CPU_CHIP_PENWELL = 2,
- INTEL_MID_CPU_CHIP_CLOVERVIEW,
- INTEL_MID_CPU_CHIP_TANGIER,
-};
-
-extern enum intel_mid_cpu_type __intel_mid_cpu_chip;
-
#ifdef CONFIG_X86_INTEL_MID
-static inline enum intel_mid_cpu_type intel_mid_identify_cpu(void)
-{
- return __intel_mid_cpu_chip;
-}
-
-static inline bool intel_mid_has_msic(void)
-{
- return (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL);
-}
-
extern void intel_scu_devices_create(void);
extern void intel_scu_devices_destroy(void);
#else /* !CONFIG_X86_INTEL_MID */
-#define intel_mid_identify_cpu() 0
-#define intel_mid_has_msic() 0
-
static inline void intel_scu_devices_create(void) { }
static inline void intel_scu_devices_destroy(void) { }
#endif /* !CONFIG_X86_INTEL_MID */
-enum intel_mid_timer_options {
- INTEL_MID_TIMER_DEFAULT,
- INTEL_MID_TIMER_APBT_ONLY,
- INTEL_MID_TIMER_LAPIC_APBT,
-};
-
-extern enum intel_mid_timer_options intel_mid_timer_options;
-
/* Bus Select SoC Fuse value */
#define BSEL_SOC_FUSE_MASK 0x7
/* FSB 133MHz */
@@ -118,16 +41,4 @@ extern enum intel_mid_timer_options intel_mid_timer_options;
/* FSB 83MHz */
#define BSEL_SOC_FUSE_111 0x7
-#define SFI_MTMR_MAX_NUM 8
-#define SFI_MRTC_MAX 8
-
-/* VRTC timer */
-#define MRST_VRTC_MAP_SZ 1024
-/* #define MRST_VRTC_PGOFFSET 0xc00 */
-
-extern void intel_mid_rtc_init(void);
-
-/* The offset for the mapping of global gpio pin to irq */
-#define INTEL_MID_IRQ_OFFSET 0x100
-
#endif /* _ASM_X86_INTEL_MID_H */
diff --git a/arch/x86/include/asm/intel_mid_vrtc.h b/arch/x86/include/asm/intel_mid_vrtc.h
deleted file mode 100644
index 0b44b1abe4d9..000000000000
--- a/arch/x86/include/asm/intel_mid_vrtc.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _INTEL_MID_VRTC_H
-#define _INTEL_MID_VRTC_H
-
-extern unsigned char vrtc_cmos_read(unsigned char reg);
-extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
-extern void vrtc_get_time(struct timespec64 *now);
-extern int vrtc_set_mmss(const struct timespec64 *now);
-
-#endif
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 11d457af68c5..8537f597d20a 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -65,6 +65,4 @@ static inline int intel_scu_ipc_dev_command(struct intel_scu_ipc_dev *scu, int c
inlen, out, outlen);
}
-#include <asm/intel_scu_ipc_legacy.h>
-
#endif
diff --git a/arch/x86/include/asm/intel_scu_ipc_legacy.h b/arch/x86/include/asm/intel_scu_ipc_legacy.h
deleted file mode 100644
index 4cf13fecb673..000000000000
--- a/arch/x86/include/asm/intel_scu_ipc_legacy.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_INTEL_SCU_IPC_LEGACY_H_
-#define _ASM_X86_INTEL_SCU_IPC_LEGACY_H_
-
-#include <linux/notifier.h>
-
-#define IPCMSG_INDIRECT_READ 0x02
-#define IPCMSG_INDIRECT_WRITE 0x05
-
-#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */
-
-#define IPCMSG_WARM_RESET 0xF0
-#define IPCMSG_COLD_RESET 0xF1
-#define IPCMSG_SOFT_RESET 0xF2
-#define IPCMSG_COLD_BOOT 0xF3
-
-#define IPCMSG_VRTC 0xFA /* Set vRTC device */
-/* Command id associated with message IPCMSG_VRTC */
-#define IPC_CMD_VRTC_SETTIME 1 /* Set time */
-#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */
-
-/* Don't call these in new code - they will be removed eventually */
-
-/* Read single register */
-static inline int intel_scu_ipc_ioread8(u16 addr, u8 *data)
-{
- return intel_scu_ipc_dev_ioread8(NULL, addr, data);
-}
-
-/* Read a vector */
-static inline int intel_scu_ipc_readv(u16 *addr, u8 *data, int len)
-{
- return intel_scu_ipc_dev_readv(NULL, addr, data, len);
-}
-
-/* Write single register */
-static inline int intel_scu_ipc_iowrite8(u16 addr, u8 data)
-{
- return intel_scu_ipc_dev_iowrite8(NULL, addr, data);
-}
-
-/* Write a vector */
-static inline int intel_scu_ipc_writev(u16 *addr, u8 *data, int len)
-{
- return intel_scu_ipc_dev_writev(NULL, addr, data, len);
-}
-
-/* Update single register based on the mask */
-static inline int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask)
-{
- return intel_scu_ipc_dev_update(NULL, addr, data, mask);
-}
-
-/* Issue commands to the SCU with or without data */
-static inline int intel_scu_ipc_simple_command(int cmd, int sub)
-{
- return intel_scu_ipc_dev_simple_command(NULL, cmd, sub);
-}
-
-static inline int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
- u32 *out, int outlen)
-{
- /* New API takes both inlen and outlen as bytes so convert here */
- size_t inbytes = inlen * sizeof(u32);
- size_t outbytes = outlen * sizeof(u32);
-
- return intel_scu_ipc_dev_command_with_size(NULL, cmd, sub, in, inbytes,
- inlen, out, outbytes);
-}
-
-extern struct blocking_notifier_head intel_scu_notifier;
-
-static inline void intel_scu_notifier_add(struct notifier_block *nb)
-{
- blocking_notifier_chain_register(&intel_scu_notifier, nb);
-}
-
-static inline void intel_scu_notifier_remove(struct notifier_block *nb)
-{
- blocking_notifier_chain_unregister(&intel_scu_notifier, nb);
-}
-
-static inline int intel_scu_notifier_post(unsigned long v, void *p)
-{
- return blocking_notifier_call_chain(&intel_scu_notifier, v, p);
-}
-
-#define SCU_AVAILABLE 1
-#define SCU_DOWN 2
-
-#endif
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 528c8a71fe7f..768aa234cbb4 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -25,8 +25,6 @@ static inline int irq_canonicalize(int irq)
extern int irq_init_percpu_irqstack(unsigned int cpu);
-#define __ARCH_HAS_DO_SOFTIRQ
-
struct irq_desc;
extern void fixup_irqs(void);
@@ -40,8 +38,6 @@ extern void native_init_IRQ(void);
extern void __handle_irq(struct irq_desc *desc, struct pt_regs *regs);
-extern __visible void do_IRQ(struct pt_regs *regs, unsigned long vector);
-
extern void init_ISA_irqs(void);
extern void __init init_IRQ(void);
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 775816965c6a..9b2a0ff76c73 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -7,100 +7,217 @@
#include <asm/processor.h>
#ifdef CONFIG_X86_64
-static __always_inline bool irqstack_active(void)
-{
- return __this_cpu_read(irq_count) != -1;
-}
-
-void asm_call_on_stack(void *sp, void (*func)(void), void *arg);
-void asm_call_sysvec_on_stack(void *sp, void (*func)(struct pt_regs *regs),
- struct pt_regs *regs);
-void asm_call_irq_on_stack(void *sp, void (*func)(struct irq_desc *desc),
- struct irq_desc *desc);
-static __always_inline void __run_on_irqstack(void (*func)(void))
-{
- void *tos = __this_cpu_read(hardirq_stack_ptr);
-
- __this_cpu_add(irq_count, 1);
- asm_call_on_stack(tos - 8, func, NULL);
- __this_cpu_sub(irq_count, 1);
+/*
+ * Macro to inline switching to an interrupt stack and invoking function
+ * calls from there. The following rules apply:
+ *
+ * - Ordering:
+ *
+ * 1. Write the stack pointer into the top most place of the irq
+ * stack. This ensures that the various unwinders can link back to the
+ * original stack.
+ *
+ * 2. Switch the stack pointer to the top of the irq stack.
+ *
+ * 3. Invoke whatever needs to be done (@asm_call argument)
+ *
+ * 4. Pop the original stack pointer from the top of the irq stack
+ * which brings it back to the original stack where it left off.
+ *
+ * - Function invocation:
+ *
+ * To allow flexible usage of the macro, the actual function code including
+ * the store of the arguments in the call ABI registers is handed in via
+ * the @asm_call argument.
+ *
+ * - Local variables:
+ *
+ * @tos:
+ * The @tos variable holds a pointer to the top of the irq stack and
+ * _must_ be allocated in a non-callee saved register as this is a
+ * restriction coming from objtool.
+ *
+ * Note, that (tos) is both in input and output constraints to ensure
+ * that the compiler does not assume that R11 is left untouched in
+ * case this macro is used in some place where the per cpu interrupt
+ * stack pointer is used again afterwards
+ *
+ * - Function arguments:
+ * The function argument(s), if any, have to be defined in register
+ * variables at the place where this is invoked. Storing the
+ * argument(s) in the proper register(s) is part of the @asm_call
+ *
+ * - Constraints:
+ *
+ * The constraints have to be done very carefully because the compiler
+ * does not know about the assembly call.
+ *
+ * output:
+ * As documented already above the @tos variable is required to be in
+ * the output constraints to make the compiler aware that R11 cannot be
+ * reused after the asm() statement.
+ *
+ * For builds with CONFIG_UNWIND_FRAME_POINTER ASM_CALL_CONSTRAINT is
+ * required as well as this prevents certain creative GCC variants from
+ * misplacing the ASM code.
+ *
+ * input:
+ * - func:
+ * Immediate, which tells the compiler that the function is referenced.
+ *
+ * - tos:
+ * Register. The actual register is defined by the variable declaration.
+ *
+ * - function arguments:
+ * The constraints are handed in via the 'argconstr' argument list. They
+ * describe the register arguments which are used in @asm_call.
+ *
+ * clobbers:
+ * Function calls can clobber anything except the callee-saved
+ * registers. Tell the compiler.
+ */
+#define call_on_irqstack(func, asm_call, argconstr...) \
+{ \
+ register void *tos asm("r11"); \
+ \
+ tos = ((void *)__this_cpu_read(hardirq_stack_ptr)); \
+ \
+ asm_inline volatile( \
+ "movq %%rsp, (%[tos]) \n" \
+ "movq %[tos], %%rsp \n" \
+ \
+ asm_call \
+ \
+ "popq %%rsp \n" \
+ \
+ : "+r" (tos), ASM_CALL_CONSTRAINT \
+ : [__func] "i" (func), [tos] "r" (tos) argconstr \
+ : "cc", "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", \
+ "memory" \
+ ); \
}
-static __always_inline void
-__run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs),
- struct pt_regs *regs)
-{
- void *tos = __this_cpu_read(hardirq_stack_ptr);
-
- __this_cpu_add(irq_count, 1);
- asm_call_sysvec_on_stack(tos - 8, func, regs);
- __this_cpu_sub(irq_count, 1);
+/* Macros to assert type correctness for run_*_on_irqstack macros */
+#define assert_function_type(func, proto) \
+ static_assert(__builtin_types_compatible_p(typeof(&func), proto))
+
+#define assert_arg_type(arg, proto) \
+ static_assert(__builtin_types_compatible_p(typeof(arg), proto))
+
+/*
+ * Macro to invoke system vector and device interrupt C handlers.
+ */
+#define call_on_irqstack_cond(func, regs, asm_call, constr, c_args...) \
+{ \
+ /* \
+ * User mode entry and interrupt on the irq stack do not \
+ * switch stacks. If from user mode the task stack is empty. \
+ */ \
+ if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \
+ irq_enter_rcu(); \
+ func(c_args); \
+ irq_exit_rcu(); \
+ } else { \
+ /* \
+ * Mark the irq stack inuse _before_ and unmark _after_ \
+ * switching stacks. Interrupts are disabled in both \
+ * places. Invoke the stack switch macro with the call \
+ * sequence which matches the above direct invocation. \
+ */ \
+ __this_cpu_write(hardirq_stack_inuse, true); \
+ call_on_irqstack(func, asm_call, constr); \
+ __this_cpu_write(hardirq_stack_inuse, false); \
+ } \
}
-static __always_inline void
-__run_irq_on_irqstack(void (*func)(struct irq_desc *desc),
- struct irq_desc *desc)
-{
- void *tos = __this_cpu_read(hardirq_stack_ptr);
-
- __this_cpu_add(irq_count, 1);
- asm_call_irq_on_stack(tos - 8, func, desc);
- __this_cpu_sub(irq_count, 1);
+/*
+ * Function call sequence for __call_on_irqstack() for system vectors.
+ *
+ * Note that irq_enter_rcu() and irq_exit_rcu() do not use the input
+ * mechanism because these functions are global and cannot be optimized out
+ * when compiling a particular source file which uses one of these macros.
+ *
+ * The argument (regs) does not need to be pushed or stashed in a callee
+ * saved register to be safe vs. the irq_enter_rcu() call because the
+ * clobbers already prevent the compiler from storing it in a callee
+ * clobbered register. As the compiler has to preserve @regs for the final
+ * call to idtentry_exit() anyway, it's likely that it does not cause extra
+ * effort for this asm magic.
+ */
+#define ASM_CALL_SYSVEC \
+ "call irq_enter_rcu \n" \
+ "movq %[arg1], %%rdi \n" \
+ "call %P[__func] \n" \
+ "call irq_exit_rcu \n"
+
+#define SYSVEC_CONSTRAINTS , [arg1] "r" (regs)
+
+#define run_sysvec_on_irqstack_cond(func, regs) \
+{ \
+ assert_function_type(func, void (*)(struct pt_regs *)); \
+ assert_arg_type(regs, struct pt_regs *); \
+ \
+ call_on_irqstack_cond(func, regs, ASM_CALL_SYSVEC, \
+ SYSVEC_CONSTRAINTS, regs); \
}
-#else /* CONFIG_X86_64 */
-static inline bool irqstack_active(void) { return false; }
-static inline void __run_on_irqstack(void (*func)(void)) { }
-static inline void __run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs),
- struct pt_regs *regs) { }
-static inline void __run_irq_on_irqstack(void (*func)(struct irq_desc *desc),
- struct irq_desc *desc) { }
-#endif /* !CONFIG_X86_64 */
-
-static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs)
-{
- if (IS_ENABLED(CONFIG_X86_32))
- return false;
- if (!regs)
- return !irqstack_active();
- return !user_mode(regs) && !irqstack_active();
+/*
+ * As in ASM_CALL_SYSVEC above the clobbers force the compiler to store
+ * @regs and @vector in callee saved registers.
+ */
+#define ASM_CALL_IRQ \
+ "call irq_enter_rcu \n" \
+ "movq %[arg1], %%rdi \n" \
+ "movl %[arg2], %%esi \n" \
+ "call %P[__func] \n" \
+ "call irq_exit_rcu \n"
+
+#define IRQ_CONSTRAINTS , [arg1] "r" (regs), [arg2] "r" (vector)
+
+#define run_irq_on_irqstack_cond(func, regs, vector) \
+{ \
+ assert_function_type(func, void (*)(struct pt_regs *, u32)); \
+ assert_arg_type(regs, struct pt_regs *); \
+ assert_arg_type(vector, u32); \
+ \
+ call_on_irqstack_cond(func, regs, ASM_CALL_IRQ, \
+ IRQ_CONSTRAINTS, regs, vector); \
}
-
-static __always_inline void run_on_irqstack_cond(void (*func)(void),
- struct pt_regs *regs)
-{
- lockdep_assert_irqs_disabled();
-
- if (irq_needs_irq_stack(regs))
- __run_on_irqstack(func);
- else
- func();
+#define ASM_CALL_SOFTIRQ \
+ "call %P[__func] \n"
+
+/*
+ * Macro to invoke __do_softirq on the irq stack. This is only called from
+ * task context when bottom halfs are about to be reenabled and soft
+ * interrupts are pending to be processed. The interrupt stack cannot be in
+ * use here.
+ */
+#define do_softirq_own_stack() \
+{ \
+ __this_cpu_write(hardirq_stack_inuse, true); \
+ call_on_irqstack(__do_softirq, ASM_CALL_SOFTIRQ); \
+ __this_cpu_write(hardirq_stack_inuse, false); \
}
-static __always_inline void
-run_sysvec_on_irqstack_cond(void (*func)(struct pt_regs *regs),
- struct pt_regs *regs)
-{
- lockdep_assert_irqs_disabled();
-
- if (irq_needs_irq_stack(regs))
- __run_sysvec_on_irqstack(func, regs);
- else
- func(regs);
+#else /* CONFIG_X86_64 */
+/* System vector handlers always run on the stack they interrupted. */
+#define run_sysvec_on_irqstack_cond(func, regs) \
+{ \
+ irq_enter_rcu(); \
+ func(regs); \
+ irq_exit_rcu(); \
}
-static __always_inline void
-run_irq_on_irqstack_cond(void (*func)(struct irq_desc *desc), struct irq_desc *desc,
- struct pt_regs *regs)
-{
- lockdep_assert_irqs_disabled();
-
- if (irq_needs_irq_stack(regs))
- __run_irq_on_irqstack(func, desc);
- else
- func(desc);
+/* Switches to the irq stack within func() */
+#define run_irq_on_irqstack_cond(func, regs, vector) \
+{ \
+ irq_enter_rcu(); \
+ func(regs, vector); \
+ irq_exit_rcu(); \
}
+#endif /* !CONFIG_X86_64 */
+
#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 2dfc8d380dab..144d70ea4393 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -35,15 +35,6 @@ extern __always_inline unsigned long native_save_fl(void)
return flags;
}
-extern inline void native_restore_fl(unsigned long flags);
-extern inline void native_restore_fl(unsigned long flags)
-{
- asm volatile("push %0 ; popf"
- : /* no output */
- :"g" (flags)
- :"memory", "cc");
-}
-
static __always_inline void native_irq_disable(void)
{
asm volatile("cli": : :"memory");
@@ -79,11 +70,6 @@ static __always_inline unsigned long arch_local_save_flags(void)
return native_save_fl();
}
-static __always_inline void arch_local_irq_restore(unsigned long flags)
-{
- native_restore_fl(flags);
-}
-
static __always_inline void arch_local_irq_disable(void)
{
native_irq_disable();
@@ -131,25 +117,7 @@ static __always_inline unsigned long arch_local_irq_save(void)
#define SAVE_FLAGS(x) pushfq; popq %rax
#endif
-#define SWAPGS swapgs
-/*
- * Currently paravirt can't handle swapgs nicely when we
- * don't have a stack we can rely on (such as a user space
- * stack). So we either find a way around these or just fault
- * and emulate if a guest tries to call swapgs directly.
- *
- * Either way, this is a good way to document that we don't
- * have a reliable stack. x86_64 only.
- */
-#define SWAPGS_UNSAFE_STACK swapgs
-
#define INTERRUPT_RETURN jmp native_iret
-#define USERGS_SYSRET64 \
- swapgs; \
- sysretq;
-#define USERGS_SYSRET32 \
- swapgs; \
- sysretl
#else
#define INTERRUPT_RETURN iret
@@ -170,6 +138,20 @@ static __always_inline int arch_irqs_disabled(void)
return arch_irqs_disabled_flags(flags);
}
+
+static __always_inline void arch_local_irq_restore(unsigned long flags)
+{
+ if (!arch_irqs_disabled_flags(flags))
+ arch_local_irq_enable();
+}
+#else
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_XEN_PV
+#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
+#else
+#define SWAPGS swapgs
+#endif
+#endif
#endif /* !__ASSEMBLY__ */
#endif
diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
new file mode 100644
index 000000000000..97bbb4a9083a
--- /dev/null
+++ b/arch/x86/include/asm/kfence.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * x86 KFENCE support.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef _ASM_X86_KFENCE_H
+#define _ASM_X86_KFENCE_H
+
+#include <linux/bug.h>
+#include <linux/kfence.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/set_memory.h>
+#include <asm/tlbflush.h>
+
+/* Force 4K pages for __kfence_pool. */
+static inline bool arch_kfence_init_pool(void)
+{
+ unsigned long addr;
+
+ for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr);
+ addr += PAGE_SIZE) {
+ unsigned int level;
+
+ if (!lookup_address(addr, &level))
+ return false;
+
+ if (level != PG_LEVEL_4K)
+ set_memory_4k(addr, 1);
+ }
+
+ return true;
+}
+
+/* Protect the given page and flush TLB. */
+static inline bool kfence_protect_page(unsigned long addr, bool protect)
+{
+ unsigned int level;
+ pte_t *pte = lookup_address(addr, &level);
+
+ if (WARN_ON(!pte || level != PG_LEVEL_4K))
+ return false;
+
+ /*
+ * We need to avoid IPIs, as we may get KFENCE allocations or faults
+ * with interrupts disabled. Therefore, the below is best-effort, and
+ * does not flush TLBs on all CPUs. We can tolerate some inaccuracy;
+ * lazy fault handling takes care of faults after the page is PRESENT.
+ */
+
+ if (protect)
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ else
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+
+ /* Flush this CPU's TLB. */
+ flush_tlb_one_kernel(addr);
+ return true;
+}
+
+#endif /* _ASM_X86_KFENCE_H */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 991a7ad540c7..d20a3d6be36e 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -58,14 +58,17 @@ struct arch_specific_insn {
/* copy of the original instruction */
kprobe_opcode_t *insn;
/*
- * boostable = false: This instruction type is not boostable.
- * boostable = true: This instruction has been boosted: we have
+ * boostable = 0: This instruction type is not boostable.
+ * boostable = 1: This instruction has been boosted: we have
* added a relative jump after the instruction copy in insn,
* so no single-step and fixup are needed (unless there's
* a post_handler).
*/
- bool boostable;
- bool if_modifier;
+ unsigned boostable:1;
+ unsigned if_modifier:1;
+ unsigned is_call:1;
+ unsigned is_pushf:1;
+ unsigned is_abs_ip:1;
/* Number of bytes of text poked */
int tp_len;
};
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
new file mode 100644
index 000000000000..323641097f63
--- /dev/null
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_X86_OP) || !defined(KVM_X86_OP_NULL)
+BUILD_BUG_ON(1)
+#endif
+
+/*
+ * KVM_X86_OP() and KVM_X86_OP_NULL() are used to help generate
+ * "static_call()"s. They are also intended for use when defining
+ * the vmx/svm kvm_x86_ops. KVM_X86_OP() can be used for those
+ * functions that follow the [svm|vmx]_func_name convention.
+ * KVM_X86_OP_NULL() can leave a NULL definition for the
+ * case where there is no definition or a function name that
+ * doesn't match the typical naming convention is supplied.
+ */
+KVM_X86_OP_NULL(hardware_enable)
+KVM_X86_OP_NULL(hardware_disable)
+KVM_X86_OP_NULL(hardware_unsetup)
+KVM_X86_OP_NULL(cpu_has_accelerated_tpr)
+KVM_X86_OP(has_emulated_msr)
+KVM_X86_OP(vcpu_after_set_cpuid)
+KVM_X86_OP(vm_init)
+KVM_X86_OP_NULL(vm_destroy)
+KVM_X86_OP(vcpu_create)
+KVM_X86_OP(vcpu_free)
+KVM_X86_OP(vcpu_reset)
+KVM_X86_OP(prepare_guest_switch)
+KVM_X86_OP(vcpu_load)
+KVM_X86_OP(vcpu_put)
+KVM_X86_OP(update_exception_bitmap)
+KVM_X86_OP(get_msr)
+KVM_X86_OP(set_msr)
+KVM_X86_OP(get_segment_base)
+KVM_X86_OP(get_segment)
+KVM_X86_OP(get_cpl)
+KVM_X86_OP(set_segment)
+KVM_X86_OP_NULL(get_cs_db_l_bits)
+KVM_X86_OP(set_cr0)
+KVM_X86_OP(is_valid_cr4)
+KVM_X86_OP(set_cr4)
+KVM_X86_OP(set_efer)
+KVM_X86_OP(get_idt)
+KVM_X86_OP(set_idt)
+KVM_X86_OP(get_gdt)
+KVM_X86_OP(set_gdt)
+KVM_X86_OP(sync_dirty_debug_regs)
+KVM_X86_OP(set_dr7)
+KVM_X86_OP(cache_reg)
+KVM_X86_OP(get_rflags)
+KVM_X86_OP(set_rflags)
+KVM_X86_OP(tlb_flush_all)
+KVM_X86_OP(tlb_flush_current)
+KVM_X86_OP_NULL(tlb_remote_flush)
+KVM_X86_OP_NULL(tlb_remote_flush_with_range)
+KVM_X86_OP(tlb_flush_gva)
+KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(run)
+KVM_X86_OP_NULL(handle_exit)
+KVM_X86_OP_NULL(skip_emulated_instruction)
+KVM_X86_OP_NULL(update_emulated_instruction)
+KVM_X86_OP(set_interrupt_shadow)
+KVM_X86_OP(get_interrupt_shadow)
+KVM_X86_OP(patch_hypercall)
+KVM_X86_OP(set_irq)
+KVM_X86_OP(set_nmi)
+KVM_X86_OP(queue_exception)
+KVM_X86_OP(cancel_injection)
+KVM_X86_OP(interrupt_allowed)
+KVM_X86_OP(nmi_allowed)
+KVM_X86_OP(get_nmi_mask)
+KVM_X86_OP(set_nmi_mask)
+KVM_X86_OP(enable_nmi_window)
+KVM_X86_OP(enable_irq_window)
+KVM_X86_OP(update_cr8_intercept)
+KVM_X86_OP(check_apicv_inhibit_reasons)
+KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl)
+KVM_X86_OP(refresh_apicv_exec_ctrl)
+KVM_X86_OP(hwapic_irr_update)
+KVM_X86_OP(hwapic_isr_update)
+KVM_X86_OP_NULL(guest_apic_has_interrupt)
+KVM_X86_OP(load_eoi_exitmap)
+KVM_X86_OP(set_virtual_apic_mode)
+KVM_X86_OP_NULL(set_apic_access_page_addr)
+KVM_X86_OP(deliver_posted_interrupt)
+KVM_X86_OP_NULL(sync_pir_to_irr)
+KVM_X86_OP(set_tss_addr)
+KVM_X86_OP(set_identity_map_addr)
+KVM_X86_OP(get_mt_mask)
+KVM_X86_OP(load_mmu_pgd)
+KVM_X86_OP_NULL(has_wbinvd_exit)
+KVM_X86_OP(write_l1_tsc_offset)
+KVM_X86_OP(get_exit_info)
+KVM_X86_OP(check_intercept)
+KVM_X86_OP(handle_exit_irqoff)
+KVM_X86_OP_NULL(request_immediate_exit)
+KVM_X86_OP(sched_in)
+KVM_X86_OP_NULL(update_cpu_dirty_logging)
+KVM_X86_OP_NULL(pre_block)
+KVM_X86_OP_NULL(post_block)
+KVM_X86_OP_NULL(vcpu_blocking)
+KVM_X86_OP_NULL(vcpu_unblocking)
+KVM_X86_OP_NULL(update_pi_irte)
+KVM_X86_OP_NULL(apicv_post_state_restore)
+KVM_X86_OP_NULL(dy_apicv_has_pending_interrupt)
+KVM_X86_OP_NULL(set_hv_timer)
+KVM_X86_OP_NULL(cancel_hv_timer)
+KVM_X86_OP(setup_mce)
+KVM_X86_OP(smi_allowed)
+KVM_X86_OP(pre_enter_smm)
+KVM_X86_OP(pre_leave_smm)
+KVM_X86_OP(enable_smi_window)
+KVM_X86_OP_NULL(mem_enc_op)
+KVM_X86_OP_NULL(mem_enc_reg_region)
+KVM_X86_OP_NULL(mem_enc_unreg_region)
+KVM_X86_OP(get_msr_feature)
+KVM_X86_OP(can_emulate_instruction)
+KVM_X86_OP(apic_init_signal_blocked)
+KVM_X86_OP_NULL(enable_direct_tlbflush)
+KVM_X86_OP_NULL(migrate_timers)
+KVM_X86_OP(msr_filter_changed)
+KVM_X86_OP_NULL(complete_emulated_msr)
+
+#undef KVM_X86_OP
+#undef KVM_X86_OP_NULL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3d6616f6f6ef..3768819693e5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -40,10 +40,8 @@
#define KVM_MAX_VCPUS 288
#define KVM_SOFT_MAX_VCPUS 240
#define KVM_MAX_VCPU_ID 1023
-#define KVM_USER_MEM_SLOTS 509
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
-#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
#define KVM_HALT_POLL_NS_DEFAULT 200000
@@ -52,6 +50,9 @@
#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
KVM_DIRTY_LOG_INITIALLY_SET)
+#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \
+ KVM_BUS_LOCK_DETECTION_EXIT)
+
/* x86-specific vcpu->requests bit members */
#define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0)
#define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1)
@@ -88,6 +89,8 @@
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
+#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
+ KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -200,9 +203,17 @@ enum x86_intercept_stage;
#define DR6_BS (1 << 14)
#define DR6_BT (1 << 15)
#define DR6_RTM (1 << 16)
-#define DR6_FIXED_1 0xfffe0ff0
-#define DR6_INIT 0xffff0ff0
+/*
+ * DR6_ACTIVE_LOW combines fixed-1 and active-low bits.
+ * We can regard all the bits in DR6_FIXED_1 as active_low bits;
+ * they will never be 0 for now, but when they are defined
+ * in the future it will require no code change.
+ *
+ * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
+ */
+#define DR6_ACTIVE_LOW 0xffff0ff0
#define DR6_VOLATILE 0x0001e00f
+#define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE)
#define DR7_BP_EN_MASK 0x000000ff
#define DR7_GE (1 << 9)
@@ -337,6 +348,8 @@ struct kvm_mmu_root_info {
#define KVM_MMU_NUM_PREV_ROOTS 3
+#define KVM_HAVE_MMU_RWLOCK
+
struct kvm_mmu_page;
/*
@@ -358,8 +371,6 @@ struct kvm_mmu {
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
- void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte);
hpa_t root_hpa;
gpa_t root_pgd;
union kvm_mmu_role mmu_role;
@@ -510,6 +521,7 @@ struct kvm_vcpu_hv_synic {
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
+ struct kvm_vcpu *vcpu;
u32 vp_index;
u64 hv_vapic;
s64 runtime_offset;
@@ -520,6 +532,21 @@ struct kvm_vcpu_hv {
cpumask_t tlb_flush;
};
+/* Xen HVM per vcpu emulation context */
+struct kvm_vcpu_xen {
+ u64 hypercall_rip;
+ u32 current_runstate;
+ bool vcpu_info_set;
+ bool vcpu_time_info_set;
+ bool runstate_set;
+ struct gfn_to_hva_cache vcpu_info_cache;
+ struct gfn_to_hva_cache vcpu_time_info_cache;
+ struct gfn_to_hva_cache runstate_cache;
+ u64 last_steal;
+ u64 runstate_entry_time;
+ u64 runstate_times[4];
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -640,7 +667,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
- unsigned long cr3_lm_rsvd_bits;
+ u64 reserved_gpa_bits;
int maxphyaddr;
int max_tdp_level;
@@ -717,7 +744,9 @@ struct kvm_vcpu_arch {
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
- struct kvm_vcpu_hv hyperv;
+ bool hyperv_enabled;
+ struct kvm_vcpu_hv *hyperv;
+ struct kvm_vcpu_xen xen;
cpumask_var_t wbinvd_dirty_mask;
@@ -855,12 +884,29 @@ struct kvm_hv_syndbg {
u64 options;
};
+/* Current state of Hyper-V TSC page clocksource */
+enum hv_tsc_page_status {
+ /* TSC page was not set up or disabled */
+ HV_TSC_PAGE_UNSET = 0,
+ /* TSC page MSR was written by the guest, update pending */
+ HV_TSC_PAGE_GUEST_CHANGED,
+ /* TSC page MSR was written by KVM userspace, update pending */
+ HV_TSC_PAGE_HOST_CHANGED,
+ /* TSC page was properly set up and is currently active */
+ HV_TSC_PAGE_SET,
+ /* TSC page is currently being updated and therefore is inactive */
+ HV_TSC_PAGE_UPDATING,
+ /* TSC page was set up with an inaccessible GPA */
+ HV_TSC_PAGE_BROKEN,
+};
+
/* Hyper-V emulation context */
struct kvm_hv {
struct mutex hv_lock;
u64 hv_guest_os_id;
u64 hv_hypercall;
u64 hv_tsc_page;
+ enum hv_tsc_page_status hv_tsc_page_status;
/* Hyper-v based guest crash (NT kernel bugcheck) parameters */
u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
@@ -888,12 +934,26 @@ struct msr_bitmap_range {
unsigned long *bitmap;
};
+/* Xen emulation context */
+struct kvm_xen {
+ bool long_mode;
+ bool shinfo_set;
+ u8 upcall_vector;
+ struct gfn_to_hva_cache shinfo_cache;
+};
+
enum kvm_irqchip_mode {
KVM_IRQCHIP_NONE,
KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
};
+struct kvm_x86_msr_filter {
+ u8 count;
+ bool default_allow:1;
+ struct msr_bitmap_range ranges[16];
+};
+
#define APICV_INHIBIT_REASON_DISABLE 0
#define APICV_INHIBIT_REASON_HYPERV 1
#define APICV_INHIBIT_REASON_NESTED 2
@@ -908,9 +968,6 @@ struct kvm_arch {
unsigned int indirect_shadow_pages;
u8 mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
- /*
- * Hash table of struct kvm_mmu_page.
- */
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
struct list_head lpage_disallowed_mmu_pages;
@@ -929,7 +986,7 @@ struct kvm_arch {
struct kvm_pit *vpit;
atomic_t vapics_in_nmi_mode;
struct mutex apic_map_lock;
- struct kvm_apic_map *apic_map;
+ struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
bool apic_access_page_done;
@@ -967,6 +1024,7 @@ struct kvm_arch {
struct hlist_head mask_notifier_list;
struct kvm_hv hyperv;
+ struct kvm_xen xen;
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
@@ -977,6 +1035,7 @@ struct kvm_arch {
u32 bsp_vcpu_id;
u64 disabled_quirks;
+ int cpu_dirty_logging_count;
enum kvm_irqchip_mode irqchip_mode;
u8 nr_reserved_ioapic_pins;
@@ -989,18 +1048,16 @@ struct kvm_arch {
bool guest_can_read_msr_platform_info;
bool exception_payload_enabled;
+ bool bus_lock_detection_enabled;
+
/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
u32 user_space_msr_mask;
+ struct kvm_x86_msr_filter __rcu *msr_filter;
- struct {
- u8 count;
- bool default_allow:1;
- struct msr_bitmap_range ranges[16];
- } msr_filter;
-
- struct kvm_pmu_event_filter *pmu_event_filter;
+ struct kvm_pmu_event_filter __rcu *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
+#ifdef CONFIG_X86_64
/*
* Whether the TDP MMU is enabled for this VM. This contains a
* snapshot of the TDP MMU module parameter from when the VM was
@@ -1026,12 +1083,25 @@ struct kvm_arch {
* tdp_mmu_page set and a root_count of 0.
*/
struct list_head tdp_mmu_pages;
+
+ /*
+ * Protects accesses to the following fields when the MMU lock
+ * is held in read mode:
+ * - tdp_mmu_pages (above)
+ * - the link field of struct kvm_mmu_pages used by the TDP MMU
+ * - lpage_disallowed_mmu_pages
+ * - the lpage_disallowed_link field of struct kvm_mmu_pages used
+ * by the TDP MMU
+ * It is acceptable, but not necessary, to acquire this lock when
+ * the thread holds the MMU lock in write mode.
+ */
+ spinlock_t tdp_mmu_pages_lock;
+#endif /* CONFIG_X86_64 */
};
struct kvm_vm_stat {
ulong mmu_shadow_zapped;
ulong mmu_pte_write;
- ulong mmu_pte_updated;
ulong mmu_pde_zapped;
ulong mmu_flooded;
ulong mmu_recycled;
@@ -1225,30 +1295,11 @@ struct kvm_x86_ops {
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
/*
- * Arch-specific dirty logging hooks. These hooks are only supposed to
- * be valid if the specific arch has hardware-accelerated dirty logging
- * mechanism. Currently only for PML on VMX.
- *
- * - slot_enable_log_dirty:
- * called when enabling log dirty mode for the slot.
- * - slot_disable_log_dirty:
- * called when disabling log dirty mode for the slot.
- * also called when slot is created with log dirty disabled.
- * - flush_log_dirty:
- * called before reporting dirty_bitmap to userspace.
- * - enable_log_dirty_pt_masked:
- * called when reenabling log dirty for the GFNs in the mask after
- * corresponding bits are cleared in slot->dirty_bitmap.
+ * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero
+ * value indicates CPU dirty logging is unsupported or disabled.
*/
- void (*slot_enable_log_dirty)(struct kvm *kvm,
- struct kvm_memory_slot *slot);
- void (*slot_disable_log_dirty)(struct kvm *kvm,
- struct kvm_memory_slot *slot);
- void (*flush_log_dirty)(struct kvm *kvm);
- void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t offset, unsigned long mask);
- int (*cpu_dirty_log_size)(void);
+ int cpu_dirty_log_size;
+ void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
/* pmu operations of sub-arch */
const struct kvm_pmu_ops *pmu_ops;
@@ -1340,6 +1391,19 @@ extern u64 __read_mostly host_efer;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern struct kvm_x86_ops kvm_x86_ops;
+#define KVM_X86_OP(func) \
+ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+
+static inline void kvm_ops_static_call_update(void)
+{
+#define KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+}
+
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
@@ -1351,7 +1415,7 @@ void kvm_arch_free_vm(struct kvm *kvm);
static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
{
if (kvm_x86_ops.tlb_remote_flush &&
- !kvm_x86_ops.tlb_remote_flush(kvm))
+ !static_call(kvm_x86_tlb_remote_flush)(kvm))
return 0;
else
return -ENOTSUPP;
@@ -1378,11 +1442,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot);
-void kvm_mmu_slot_set_dirty(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
-void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
@@ -1421,6 +1480,8 @@ extern u8 kvm_tsc_scaling_ratio_frac_bits;
extern u64 kvm_max_tsc_scaling_ratio;
/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
extern u64 kvm_default_tsc_scaling_ratio;
+/* bus lock detection supported? */
+extern bool kvm_has_bus_lock_exit;
extern u64 kvm_mce_cap_supported;
@@ -1501,7 +1562,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
+void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -1552,7 +1613,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
@@ -1742,14 +1802,12 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
- if (kvm_x86_ops.vcpu_blocking)
- kvm_x86_ops.vcpu_blocking(vcpu);
+ static_call_cond(kvm_x86_vcpu_blocking)(vcpu);
}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- if (kvm_x86_ops.vcpu_unblocking)
- kvm_x86_ops.vcpu_unblocking(vcpu);
+ static_call_cond(kvm_x86_vcpu_unblocking)(vcpu);
}
static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 56cdeaac76a0..ddfb3cad8dff 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -289,28 +289,6 @@ extern void (*mce_threshold_vector)(void);
extern void (*deferred_error_int_vector)(void);
/*
- * Thermal handler
- */
-
-void intel_init_thermal(struct cpuinfo_x86 *c);
-
-/* Interrupt Handler for core thermal thresholds */
-extern int (*platform_thermal_notify)(__u64 msr_val);
-
-/* Interrupt Handler for package thermal thresholds */
-extern int (*platform_thermal_package_notify)(__u64 msr_val);
-
-/* Callback support of rate control, return true, if
- * callback has rate control */
-extern bool (*platform_thermal_package_rate_control)(void);
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-extern void mcheck_intel_therm_init(void);
-#else
-static inline void mcheck_intel_therm_init(void) { }
-#endif
-
-/*
* Used by APEI to report memory error via /dev/mcelog
*/
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 2b7cc5397f80..ab45a220fac4 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -127,14 +127,12 @@ static inline unsigned int x86_cpuid_family(void)
}
#ifdef CONFIG_MICROCODE
-int __init microcode_init(void);
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
void reload_early_microcode(void);
extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
extern bool initrd_gone;
#else
-static inline int __init microcode_init(void) { return 0; };
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void reload_early_microcode(void) { }
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 30f76b966857..ccf60a809a17 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -78,6 +78,13 @@ extern int hyperv_init_cpuhp;
extern void *hv_hypercall_pg;
extern void __percpu **hyperv_pcpu_input_arg;
+extern void __percpu **hyperv_pcpu_output_arg;
+
+extern u64 hv_current_partition_id;
+
+int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
+int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
+int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
{
@@ -239,6 +246,8 @@ int hyperv_fill_flush_guest_mapping_list(
struct hv_guest_mapping_flush_list *flush,
u64 start_gfn, u64 end_gfn);
+extern bool hv_root_partition;
+
#ifdef CONFIG_X86_64
void hv_apic_init(void);
void __init hv_init_spinlocks(void);
@@ -250,10 +259,16 @@ static inline void hv_apic_init(void) {}
static inline void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry,
struct msi_desc *msi_desc)
{
- msi_entry->address = msi_desc->msg.address_lo;
- msi_entry->data = msi_desc->msg.data;
+ msi_entry->address.as_uint32 = msi_desc->msg.address_lo;
+ msi_entry->data.as_uint32 = msi_desc->msg.data;
}
+struct irq_domain *hv_create_pci_msi_domain(void);
+
+int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
+ struct hv_interrupt_entry *entry);
+int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
+
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline void hyperv_setup_mmu_ops(void) {}
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 9d5d949e662e..1cb9c17a4cb4 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -9,7 +9,6 @@
#ifdef CONFIG_X86_LOCAL_APIC
-extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
extern void release_perfctr_nmi(unsigned int);
extern int reserve_evntsel_nmi(unsigned int);
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index fdbffec4cfde..5a2baf28a1dc 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -40,6 +40,8 @@
#define ORC_REG_MAX 15
#ifndef __ASSEMBLY__
+#include <asm/byteorder.h>
+
/*
* This struct is more or less a vastly simplified version of the DWARF Call
* Frame Information standard. It contains only the necessary parts of DWARF
@@ -51,10 +53,18 @@
struct orc_entry {
s16 sp_offset;
s16 bp_offset;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned sp_reg:4;
unsigned bp_reg:4;
unsigned type:2;
unsigned end:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ unsigned bp_reg:4;
+ unsigned sp_reg:4;
+ unsigned unused:5;
+ unsigned end:1;
+ unsigned type:2;
+#endif
} __packed;
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 645bd1d0ee07..64297eabad63 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -66,7 +66,7 @@
* On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, then that syscall will enter the kernel with a
* non-canonical return address, and SYSRET will explode dangerously.
- * We avoid this particular problem by preventing anything executable
+ * We avoid this particular problem by preventing anything
* from being mapped at the maximum canonical address.
*
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index f8dce11d2bc1..4abf110e2243 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -648,11 +648,6 @@ static inline notrace unsigned long arch_local_save_flags(void)
return PVOP_CALLEE0(unsigned long, irq.save_fl);
}
-static inline notrace void arch_local_irq_restore(unsigned long f)
-{
- PVOP_VCALLEE1(irq.restore_fl, f);
-}
-
static inline notrace void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(irq.irq_disable);
@@ -776,31 +771,6 @@ extern void default_banner(void);
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT_XXL
-/*
- * If swapgs is used while the userspace stack is still current,
- * there's no way to call a pvop. The PV replacement *must* be
- * inlined, or the swapgs instruction must be trapped and emulated.
- */
-#define SWAPGS_UNSAFE_STACK \
- PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs)
-
-/*
- * Note: swapgs is very special, and in practise is either going to be
- * implemented with a single "swapgs" instruction or something very
- * special. Either way, we don't need to save any registers for
- * it.
- */
-#define SWAPGS \
- PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \
- )
-
-#define USERGS_SYSRET64 \
- PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \
- ANNOTATE_RETPOLINE_SAFE; \
- jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);)
-
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(clobbers) \
PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b6b02b7c19cc..de87087d3bde 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -156,20 +156,10 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
- /*
- * Switch to usermode gs and return to 64-bit usermode using
- * sysret. Only used in 64-bit kernels to return to 64-bit
- * processes. Usermode register state, including %rsp, must
- * already be restored.
- */
- void (*usergs_sysret64)(void);
-
/* Normal iret. Jump to this with the standard iret stack
frame set up. */
void (*iret)(void);
- void (*swapgs)(void);
-
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
#endif
@@ -178,16 +168,13 @@ struct pv_cpu_ops {
struct pv_irq_ops {
#ifdef CONFIG_PARAVIRT_XXL
/*
- * Get/set interrupt state. save_fl and restore_fl are only
- * expected to use X86_EFLAGS_IF; all other bits
- * returned from save_fl are undefined, and may be ignored by
- * restore_fl.
+ * Get/set interrupt state. save_fl is expected to use X86_EFLAGS_IF;
+ * all other bits returned from save_fl are undefined.
*
* NOTE: These functions callers expect the callee to preserve
* more registers than the standard C calling convention.
*/
struct paravirt_callee_save save_fl;
- struct paravirt_callee_save restore_fl;
struct paravirt_callee_save irq_disable;
struct paravirt_callee_save irq_enable;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index b9a7fd0a27e2..544f41a179fb 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -261,8 +261,12 @@ struct x86_pmu_capability {
#define INTEL_PMC_IDX_TD_BAD_SPEC (INTEL_PMC_IDX_METRIC_BASE + 1)
#define INTEL_PMC_IDX_TD_FE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 2)
#define INTEL_PMC_IDX_TD_BE_BOUND (INTEL_PMC_IDX_METRIC_BASE + 3)
-#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_BE_BOUND
-#define INTEL_PMC_MSK_TOPDOWN ((0xfull << INTEL_PMC_IDX_METRIC_BASE) | \
+#define INTEL_PMC_IDX_TD_HEAVY_OPS (INTEL_PMC_IDX_METRIC_BASE + 4)
+#define INTEL_PMC_IDX_TD_BR_MISPREDICT (INTEL_PMC_IDX_METRIC_BASE + 5)
+#define INTEL_PMC_IDX_TD_FETCH_LAT (INTEL_PMC_IDX_METRIC_BASE + 6)
+#define INTEL_PMC_IDX_TD_MEM_BOUND (INTEL_PMC_IDX_METRIC_BASE + 7)
+#define INTEL_PMC_IDX_METRIC_END INTEL_PMC_IDX_TD_MEM_BOUND
+#define INTEL_PMC_MSK_TOPDOWN ((0xffull << INTEL_PMC_IDX_METRIC_BASE) | \
INTEL_PMC_MSK_FIXED_SLOTS)
/*
@@ -280,8 +284,14 @@ struct x86_pmu_capability {
#define INTEL_TD_METRIC_BAD_SPEC 0x8100 /* Bad speculation metric */
#define INTEL_TD_METRIC_FE_BOUND 0x8200 /* FE bound metric */
#define INTEL_TD_METRIC_BE_BOUND 0x8300 /* BE bound metric */
-#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_BE_BOUND
-#define INTEL_TD_METRIC_NUM 4
+/* Level 2 metrics */
+#define INTEL_TD_METRIC_HEAVY_OPS 0x8400 /* Heavy Operations metric */
+#define INTEL_TD_METRIC_BR_MISPREDICT 0x8500 /* Branch Mispredict metric */
+#define INTEL_TD_METRIC_FETCH_LAT 0x8600 /* Fetch Latency metric */
+#define INTEL_TD_METRIC_MEM_BOUND 0x8700 /* Memory bound metric */
+
+#define INTEL_TD_METRIC_MAX INTEL_TD_METRIC_MEM_BOUND
+#define INTEL_TD_METRIC_NUM 8
static inline bool is_metric_idx(int idx)
{
@@ -483,11 +493,7 @@ static inline void perf_check_microcode(void) { }
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
#else
-static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
- *nr = 0;
- return NULL;
-}
+struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
{
return -1;
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 394757ee030a..f24d7ef8fffa 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -177,8 +177,6 @@ enum page_cache_mode {
#define __pgprot(x) ((pgprot_t) { (x) } )
#define __pg(x) __pgprot(x)
-#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
-
#define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G)
#define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0)
#define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0)
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
index 16b9f220bdeb..40f92270515b 100644
--- a/arch/x86/include/asm/platform_sst_audio.h
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -10,8 +10,6 @@
#ifndef _PLATFORM_SST_AUDIO_H_
#define _PLATFORM_SST_AUDIO_H_
-#include <linux/sfi.h>
-
#define MAX_NUM_STREAMS_MRFLD 25
#define MAX_NUM_STREAMS MAX_NUM_STREAMS_MRFLD
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 69485ca13665..f8cb8af4de5c 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -5,6 +5,7 @@
#include <asm/rmwcc.h>
#include <asm/percpu.h>
#include <linux/thread_info.h>
+#include <linux/static_call_types.h>
DECLARE_PER_CPU(int, __preempt_count);
@@ -103,16 +104,45 @@ static __always_inline bool should_resched(int preempt_offset)
}
#ifdef CONFIG_PREEMPTION
- extern asmlinkage void preempt_schedule_thunk(void);
-# define __preempt_schedule() \
- asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)
- extern asmlinkage void preempt_schedule(void);
- extern asmlinkage void preempt_schedule_notrace_thunk(void);
-# define __preempt_schedule_notrace() \
- asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT)
+extern asmlinkage void preempt_schedule(void);
+extern asmlinkage void preempt_schedule_thunk(void);
- extern asmlinkage void preempt_schedule_notrace(void);
-#endif
+#define __preempt_schedule_func preempt_schedule_thunk
+
+extern asmlinkage void preempt_schedule_notrace(void);
+extern asmlinkage void preempt_schedule_notrace_thunk(void);
+
+#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+
+#define __preempt_schedule() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+
+#define __preempt_schedule_notrace() \
+do { \
+ __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \
+ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+#else /* PREEMPT_DYNAMIC */
+
+#define __preempt_schedule() \
+ asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT);
+
+#define __preempt_schedule_notrace() \
+ asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT);
+
+#endif /* PREEMPT_DYNAMIC */
+
+#endif /* PREEMPTION */
#endif /* __ASM_PREEMPT_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b..f1b9ed5efaa9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -426,8 +426,6 @@ struct irq_stack {
char stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);
-DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
-
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
@@ -454,7 +452,8 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
}
-DECLARE_PER_CPU(unsigned int, irq_count);
+DECLARE_PER_CPU(void *, hardirq_stack_ptr);
+DECLARE_PER_CPU(bool, hardirq_stack_inuse);
extern asmlinkage void ignore_sysret(void);
/* Save actual FS/GS selectors and bases to current->thread */
@@ -473,9 +472,9 @@ struct stack_canary {
};
DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
-/* Per CPU softirq stack pointer */
+DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
-#endif /* X86_64 */
+#endif /* !X86_64 */
extern unsigned int fpu_kernel_xstate_size;
extern unsigned int fpu_user_xstate_size;
@@ -552,15 +551,6 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
*size = fpu_kernel_xstate_size;
}
-/*
- * Thread-synchronous status.
- *
- * This is different from the flags in that nobody else
- * ever touches our thread-synchronous status, so we don't
- * have to worry about atomic accesses.
- */
-#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
-
static inline void
native_load_sp0(unsigned long sp0)
{
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 2c35f1c01a2d..b6a9d51d1d79 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -25,6 +25,7 @@ void __end_SYSENTER_singlestep_region(void);
void entry_SYSENTER_compat(void);
void __end_entry_SYSENTER_compat(void);
void entry_SYSCALL_compat(void);
+void entry_SYSCALL_compat_safe_stack(void);
void entry_INT80_compat(void);
#ifdef CONFIG_XEN_PV
void xen_entry_INT80_compat(void);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index d8324a236696..409f661481e1 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -94,6 +94,8 @@ struct pt_regs {
#include <asm/paravirt_types.h>
#endif
+#include <asm/proto.h>
+
struct cpuinfo_x86;
struct task_struct;
@@ -175,6 +177,19 @@ static inline bool any_64bit_mode(struct pt_regs *regs)
#ifdef CONFIG_X86_64
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
+
+static inline bool ip_within_syscall_gap(struct pt_regs *regs)
+{
+ bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
+ regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack);
+
+#ifdef CONFIG_IA32_EMULATION
+ ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat &&
+ regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack);
+#endif
+
+ return ret;
+}
#endif
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 3ff0d48469f2..b2d504f11937 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -101,6 +101,7 @@
#define REQUIRED_MASK16 0
#define REQUIRED_MASK17 0
#define REQUIRED_MASK18 0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
+#define REQUIRED_MASK19 0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 07603064df8f..d60ed0668a59 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -56,19 +56,22 @@ static void __resctrl_sched_in(void)
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
u32 closid = state->default_closid;
u32 rmid = state->default_rmid;
+ u32 tmp;
/*
* If this task has a closid/rmid assigned, use it.
* Else use the closid/rmid assigned to this cpu.
*/
if (static_branch_likely(&rdt_alloc_enable_key)) {
- if (current->closid)
- closid = current->closid;
+ tmp = READ_ONCE(current->closid);
+ if (tmp)
+ closid = tmp;
}
if (static_branch_likely(&rdt_mon_enable_key)) {
- if (current->rmid)
- rmid = current->rmid;
+ tmp = READ_ONCE(current->rmid);
+ if (tmp)
+ rmid = tmp;
}
if (closid != state->cur_closid || rmid != state->cur_rmid) {
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8b58d6975d5d..0bc9b0895f33 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -58,9 +58,8 @@ static __always_inline unsigned long smap_save(void)
unsigned long flags;
asm volatile ("# smap_save\n\t"
- ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP)
- "pushf; pop %0; " __ASM_CLAC "\n\t"
- "1:"
+ ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t",
+ X86_FEATURE_SMAP)
: "=rm" (flags) : : "memory", "cc");
return flags;
@@ -69,9 +68,8 @@ static __always_inline unsigned long smap_save(void)
static __always_inline void smap_restore(unsigned long flags)
{
asm volatile ("# smap_restore\n\t"
- ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP)
- "push %0; popf\n\t"
- "1:"
+ ALTERNATIVE("", "push %0; popf\n\t",
+ X86_FEATURE_SMAP)
: : "g" (flags) : "memory", "cc");
}
diff --git a/arch/x86/include/asm/softirq_stack.h b/arch/x86/include/asm/softirq_stack.h
new file mode 100644
index 000000000000..889d53d6a0e1
--- /dev/null
+++ b/arch/x86/include/asm/softirq_stack.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SOFTIRQ_STACK_H
+#define _ASM_X86_SOFTIRQ_STACK_H
+
+#ifdef CONFIG_X86_64
+# include <asm/irq_stack.h>
+#else
+# include <asm-generic/softirq_stack.h>
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index cc177b4431ae..1d3cbaef4bb7 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -243,10 +243,10 @@ static inline void serialize(void)
}
/* The dst parameter must be 64-bytes aligned */
-static inline void movdir64b(void *dst, const void *src)
+static inline void movdir64b(void __iomem *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
- struct { char _[64]; } *__dst = dst;
+ struct { char _[64]; } __iomem *__dst = dst;
/*
* MOVDIR64B %(rdx), rax.
@@ -286,7 +286,7 @@ static inline void movdir64b(void *dst, const void *src)
static inline int enqcmds(void __iomem *dst, const void *src)
{
const struct { char _[64]; } *__src = src;
- struct { char _[64]; } *__dst = dst;
+ struct { char _[64]; } __iomem *__dst = dst;
int zf;
/*
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index c37f11999d0c..cbb67b6030f9 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -37,4 +37,11 @@
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
+
+#define ARCH_ADD_TRAMP_KEY(name) \
+ asm(".pushsection .static_call_tramp_key, \"a\" \n" \
+ ".long " STATIC_CALL_TRAMP_STR(name) " - . \n" \
+ ".long " STATIC_CALL_KEY_STR(name) " - . \n" \
+ ".popsection \n")
+
#endif /* _ASM_STATIC_CALL_H */
diff --git a/arch/x86/include/asm/thermal.h b/arch/x86/include/asm/thermal.h
new file mode 100644
index 000000000000..ddbdefd5b94f
--- /dev/null
+++ b/arch/x86/include/asm/thermal.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_THERMAL_H
+#define _ASM_X86_THERMAL_H
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+void intel_init_thermal(struct cpuinfo_x86 *c);
+bool x86_thermal_enabled(void);
+void intel_thermal_interrupt(void);
+#else
+static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
+#endif
+
+#endif /* _ASM_X86_THERMAL_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0d751d5da702..06b740bae431 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -205,10 +205,23 @@ static inline int arch_within_stack_frames(const void * const stack,
#endif
+/*
+ * Thread-synchronous status.
+ *
+ * This is different from the flags in that nobody else
+ * ever touches our thread-synchronous status, so we don't
+ * have to worry about atomic accesses.
+ */
+#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
+
+#ifndef __ASSEMBLY__
#ifdef CONFIG_COMPAT
#define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */
+
+#define arch_set_restart_data(restart) \
+ do { restart->arch_data = current_thread_info()->status; } while (0)
+
#endif
-#ifndef __ASSEMBLY__
#ifdef CONFIG_X86_32
#define in_ia32_syscall() true
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 820082bd6880..1bfe979bb9bc 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -4,7 +4,6 @@
#define tlb_start_vma(tlb, vma) do { } while (0)
#define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#define tlb_flush tlb_flush
static inline void tlb_flush(struct mmu_gather *tlb);
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index 664d4610d700..8e574c0afef8 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -48,17 +48,8 @@
UNWIND_HINT_REGS base=\base offset=\offset partial=1
.endm
-.macro UNWIND_HINT_FUNC sp_offset=8
- UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=\sp_offset type=UNWIND_HINT_TYPE_CALL
-.endm
-
-/*
- * RET_OFFSET: Used on instructions that terminate a function; mostly RETURN
- * and sibling calls. On these, sp_offset denotes the expected offset from
- * initial_func_cfi.
- */
-.macro UNWIND_HINT_RET_OFFSET sp_offset=8
- UNWIND_HINT sp_reg=ORC_REG_SP type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset
+.macro UNWIND_HINT_FUNC
+ UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC
.endm
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 9aad0e0876fb..8757078d4442 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -30,16 +30,29 @@ static inline int cpu_has_vmx(void)
}
-/** Disable VMX on the current CPU
+/**
+ * cpu_vmxoff() - Disable VMX on the current CPU
*
- * vmxoff causes a undefined-opcode exception if vmxon was not run
- * on the CPU previously. Only call this function if you know VMX
- * is enabled.
+ * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
+ *
+ * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
+ * atomically track post-VMXON state, e.g. this may be called in NMI context.
+ * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
+ * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
+ * magically in RM, VM86, compat mode, or at CPL>0.
*/
-static inline void cpu_vmxoff(void)
+static inline int cpu_vmxoff(void)
{
- asm volatile ("vmxoff");
+ asm_volatile_goto("1: vmxoff\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "cc", "memory" : fault);
+
+ cr4_clear_bits(X86_CR4_VMXE);
+ return 0;
+
+fault:
cr4_clear_bits(X86_CR4_VMXE);
+ return -EIO;
}
static inline int cpu_vmx_enabled(void)
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 26efbec94448..9e8ac5073ecb 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -36,7 +36,6 @@ struct vm86 {
unsigned long saved_sp0;
unsigned long flags;
- unsigned long screen_bitmap;
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 38ca445a8429..358707f60d99 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -73,6 +73,7 @@
#define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA)
#define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING)
#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
+#define SECONDARY_EXEC_BUS_LOCK_DETECTION VMCS_CONTROL_BIT(BUS_LOCK_DETECTION)
#define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING)
#define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING)
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index 9915990fd8cf..d9a74681a77d 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -83,5 +83,6 @@
#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */
#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
+#define VMX_FEATURE_BUS_LOCK_DETECTION ( 2*32+ 30) /* "" VM-Exit when bus lock caused */
#endif /* _ASM_X86_VMXFEATURES_H */
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 9139b3e86316..baca0b00ef76 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -182,6 +182,9 @@ struct arch_shared_info {
unsigned long p2m_cr3; /* cr3 value of the p2m address space */
unsigned long p2m_vaddr; /* virtual address of the p2m list */
unsigned long p2m_generation; /* generation count of p2m mapping */
+#ifdef CONFIG_X86_32
+ uint32_t wc_sec_hi;
+#endif
};
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 1a162e559753..7068e4bb057d 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -87,6 +87,18 @@ clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
#endif
/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define XEN_EXTRA_MEM_RATIO (10)
+
+/*
* Helper functions to write or read unsigned long values to/from
* memory, when the access may fault.
*/
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 8e76d3701db3..5a3022c8af82 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -112,6 +112,7 @@ struct kvm_ioapic_state {
#define KVM_NR_IRQCHIPS 3
#define KVM_RUN_X86_SMM (1 << 0)
+#define KVM_RUN_X86_BUS_LOCK (1 << 1)
/* for KVM_GET_REGS and KVM_SET_REGS */
struct kvm_regs {
diff --git a/arch/x86/include/uapi/asm/vm86.h b/arch/x86/include/uapi/asm/vm86.h
index d2ee4e307ef8..18909b8050bc 100644
--- a/arch/x86/include/uapi/asm/vm86.h
+++ b/arch/x86/include/uapi/asm/vm86.h
@@ -97,7 +97,7 @@ struct revectored_struct {
struct vm86_struct {
struct vm86_regs regs;
unsigned long flags;
- unsigned long screen_bitmap;
+ unsigned long screen_bitmap; /* unused, preserved by vm86() */
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
@@ -106,7 +106,7 @@ struct vm86_struct {
/*
* flags masks
*/
-#define VM86_SCREEN_BITMAP 0x0001
+#define VM86_SCREEN_BITMAP 0x0001 /* no longer supported */
struct vm86plus_info_struct {
unsigned long force_return_for_pic:1;
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index ada955c5ebb6..b8e650a985e3 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -89,6 +89,7 @@
#define EXIT_REASON_XRSTORS 64
#define EXIT_REASON_UMWAIT 67
#define EXIT_REASON_TPAUSE 68
+#define EXIT_REASON_BUS_LOCK 74
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -150,7 +151,8 @@
{ EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" }, \
{ EXIT_REASON_UMWAIT, "UMWAIT" }, \
- { EXIT_REASON_TPAUSE, "TPAUSE" }
+ { EXIT_REASON_TPAUSE, "TPAUSE" }, \
+ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }
#define VMX_EXIT_REASON_FLAGS \
{ VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" }
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5eeb808eb024..2ddf08351f0b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -116,7 +116,6 @@ obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
-obj-$(CONFIG_APB_TIMER) += apb_timer.o
obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index f1bb57b0e41e..cf340d85946a 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 5d3a0b8fd379..56b6865afb2a 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0-only */
.text
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <asm/segment.h>
#include <asm/pgtable_types.h>
#include <asm/page_types.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>
+#include <asm/nospec-branch.h>
# Copyright 2003 Pavel Machek <pavel@suse.cz
@@ -39,6 +41,7 @@ SYM_FUNC_START(wakeup_long64)
movq saved_rbp, %rbp
movq saved_rip, %rax
+ ANNOTATE_RETPOLINE_SAFE
jmp *%rax
SYM_FUNC_END(wakeup_long64)
@@ -126,6 +129,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
FRAME_END
jmp restore_processor_state
SYM_FUNC_END(do_suspend_lowlevel)
+STACK_FRAME_NON_STANDARD do_suspend_lowlevel
.data
saved_rbp: .quad 0
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
deleted file mode 100644
index 263eeaddb0aa..000000000000
--- a/arch/x86/kernel/apb_timer.c
+++ /dev/null
@@ -1,347 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * apb_timer.c: Driver for Langwell APB timers
- *
- * (C) Copyright 2009 Intel Corporation
- * Author: Jacob Pan (jacob.jun.pan@intel.com)
- *
- * Note:
- * Langwell is the south complex of Intel Moorestown MID platform. There are
- * eight external timers in total that can be used by the operating system.
- * The timer information, such as frequency and addresses, is provided to the
- * OS via SFI tables.
- * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
- * individual redirection table entries (RTE).
- * Unlike HPET, there is no master counter, therefore one of the timers are
- * used as clocksource. The overall allocation looks like:
- * - timer 0 - NR_CPUs for per cpu timer
- * - one timer for clocksource
- * - one timer for watchdog driver.
- * It is also worth notice that APB timer does not support true one-shot mode,
- * free-running mode will be used here to emulate one-shot mode.
- * APB timer can also be used as broadcast timer along with per cpu local APIC
- * timer, but by default APB timer has higher rating than local APIC timers.
- */
-
-#include <linux/delay.h>
-#include <linux/dw_apb_timer.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/pm.h>
-#include <linux/sfi.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/irq.h>
-
-#include <asm/fixmap.h>
-#include <asm/apb_timer.h>
-#include <asm/intel-mid.h>
-#include <asm/time.h>
-
-#define APBT_CLOCKEVENT_RATING 110
-#define APBT_CLOCKSOURCE_RATING 250
-
-#define APBT_CLOCKEVENT0_NUM (0)
-#define APBT_CLOCKSOURCE_NUM (2)
-
-static phys_addr_t apbt_address;
-static int apb_timer_block_enabled;
-static void __iomem *apbt_virt_address;
-
-/*
- * Common DW APB timer info
- */
-static unsigned long apbt_freq;
-
-struct apbt_dev {
- struct dw_apb_clock_event_device *timer;
- unsigned int num;
- int cpu;
- unsigned int irq;
- char name[10];
-};
-
-static struct dw_apb_clocksource *clocksource_apbt;
-
-static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
-{
- return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
-}
-
-static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
-
-#ifdef CONFIG_SMP
-static unsigned int apbt_num_timers_used;
-#endif
-
-static inline void apbt_set_mapping(void)
-{
- struct sfi_timer_table_entry *mtmr;
- int phy_cs_timer_id = 0;
-
- if (apbt_virt_address) {
- pr_debug("APBT base already mapped\n");
- return;
- }
- mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
- if (mtmr == NULL) {
- printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
- APBT_CLOCKEVENT0_NUM);
- return;
- }
- apbt_address = (phys_addr_t)mtmr->phys_addr;
- if (!apbt_address) {
- printk(KERN_WARNING "No timer base from SFI, use default\n");
- apbt_address = APBT_DEFAULT_BASE;
- }
- apbt_virt_address = ioremap(apbt_address, APBT_MMAP_SIZE);
- if (!apbt_virt_address) {
- pr_debug("Failed mapping APBT phy address at %lu\n",\
- (unsigned long)apbt_address);
- goto panic_noapbt;
- }
- apbt_freq = mtmr->freq_hz;
- sfi_free_mtmr(mtmr);
-
- /* Now figure out the physical timer id for clocksource device */
- mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
- if (mtmr == NULL)
- goto panic_noapbt;
-
- /* Now figure out the physical timer id */
- pr_debug("Use timer %d for clocksource\n",
- (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
- phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
- APBTMRS_REG_SIZE;
-
- clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
- "apbt0", apbt_virt_address + phy_cs_timer_id *
- APBTMRS_REG_SIZE, apbt_freq);
- return;
-
-panic_noapbt:
- panic("Failed to setup APB system timer\n");
-
-}
-
-static inline void apbt_clear_mapping(void)
-{
- iounmap(apbt_virt_address);
- apbt_virt_address = NULL;
-}
-
-static int __init apbt_clockevent_register(void)
-{
- struct sfi_timer_table_entry *mtmr;
- struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev);
-
- mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
- if (mtmr == NULL) {
- printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
- APBT_CLOCKEVENT0_NUM);
- return -ENODEV;
- }
-
- adev->num = smp_processor_id();
- adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
- intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ?
- APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
- adev_virt_addr(adev), 0, apbt_freq);
- /* Firmware does EOI handling for us. */
- adev->timer->eoi = NULL;
-
- if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
- global_clock_event = &adev->timer->ced;
- printk(KERN_DEBUG "%s clockevent registered as global\n",
- global_clock_event->name);
- }
-
- dw_apb_clockevent_register(adev->timer);
-
- sfi_free_mtmr(mtmr);
- return 0;
-}
-
-#ifdef CONFIG_SMP
-
-static void apbt_setup_irq(struct apbt_dev *adev)
-{
- irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
- irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
-}
-
-/* Should be called with per cpu */
-void apbt_setup_secondary_clock(void)
-{
- struct apbt_dev *adev;
- int cpu;
-
- /* Don't register boot CPU clockevent */
- cpu = smp_processor_id();
- if (!cpu)
- return;
-
- adev = this_cpu_ptr(&cpu_apbt_dev);
- if (!adev->timer) {
- adev->timer = dw_apb_clockevent_init(cpu, adev->name,
- APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
- adev->irq, apbt_freq);
- adev->timer->eoi = NULL;
- } else {
- dw_apb_clockevent_resume(adev->timer);
- }
-
- printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
- cpu, adev->name, adev->cpu);
-
- apbt_setup_irq(adev);
- dw_apb_clockevent_register(adev->timer);
-
- return;
-}
-
-/*
- * this notify handler process CPU hotplug events. in case of S0i3, nonboot
- * cpus are disabled/enabled frequently, for performance reasons, we keep the
- * per cpu timer irq registered so that we do need to do free_irq/request_irq.
- *
- * TODO: it might be more reliable to directly disable percpu clockevent device
- * without the notifier chain. currently, cpu 0 may get interrupts from other
- * cpu timers during the offline process due to the ordering of notification.
- * the extra interrupt is harmless.
- */
-static int apbt_cpu_dead(unsigned int cpu)
-{
- struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
-
- dw_apb_clockevent_pause(adev->timer);
- if (system_state == SYSTEM_RUNNING) {
- pr_debug("skipping APBT CPU %u offline\n", cpu);
- } else {
- pr_debug("APBT clockevent for cpu %u offline\n", cpu);
- dw_apb_clockevent_stop(adev->timer);
- }
- return 0;
-}
-
-static __init int apbt_late_init(void)
-{
- if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
- !apb_timer_block_enabled)
- return 0;
- return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL,
- apbt_cpu_dead);
-}
-fs_initcall(apbt_late_init);
-#else
-
-void apbt_setup_secondary_clock(void) {}
-
-#endif /* CONFIG_SMP */
-
-static int apbt_clocksource_register(void)
-{
- u64 start, now;
- u64 t1;
-
- /* Start the counter, use timer 2 as source, timer 0/1 for event */
- dw_apb_clocksource_start(clocksource_apbt);
-
- /* Verify whether apbt counter works */
- t1 = dw_apb_clocksource_read(clocksource_apbt);
- start = rdtsc();
-
- /*
- * We don't know the TSC frequency yet, but waiting for
- * 200000 TSC cycles is safe:
- * 4 GHz == 50us
- * 1 GHz == 200us
- */
- do {
- rep_nop();
- now = rdtsc();
- } while ((now - start) < 200000UL);
-
- /* APBT is the only always on clocksource, it has to work! */
- if (t1 == dw_apb_clocksource_read(clocksource_apbt))
- panic("APBT counter not counting. APBT disabled\n");
-
- dw_apb_clocksource_register(clocksource_apbt);
-
- return 0;
-}
-
-/*
- * Early setup the APBT timer, only use timer 0 for booting then switch to
- * per CPU timer if possible.
- * returns 1 if per cpu apbt is setup
- * returns 0 if no per cpu apbt is chosen
- * panic if set up failed, this is the only platform timer on Moorestown.
- */
-void __init apbt_time_init(void)
-{
-#ifdef CONFIG_SMP
- int i;
- struct sfi_timer_table_entry *p_mtmr;
- struct apbt_dev *adev;
-#endif
-
- if (apb_timer_block_enabled)
- return;
- apbt_set_mapping();
- if (!apbt_virt_address)
- goto out_noapbt;
- /*
- * Read the frequency and check for a sane value, for ESL model
- * we extend the possible clock range to allow time scaling.
- */
-
- if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
- pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
- goto out_noapbt;
- }
- if (apbt_clocksource_register()) {
- pr_debug("APBT has failed to register clocksource\n");
- goto out_noapbt;
- }
- if (!apbt_clockevent_register())
- apb_timer_block_enabled = 1;
- else {
- pr_debug("APBT has failed to register clockevent\n");
- goto out_noapbt;
- }
-#ifdef CONFIG_SMP
- /* kernel cmdline disable apb timer, so we will use lapic timers */
- if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
- printk(KERN_INFO "apbt: disabled per cpu timer\n");
- return;
- }
- pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
- if (num_possible_cpus() <= sfi_mtimer_num)
- apbt_num_timers_used = num_possible_cpus();
- else
- apbt_num_timers_used = 1;
- pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
-
- /* here we set up per CPU timer data structure */
- for (i = 0; i < apbt_num_timers_used; i++) {
- adev = &per_cpu(cpu_apbt_dev, i);
- adev->num = i;
- adev->cpu = i;
- p_mtmr = sfi_get_mtmr(i);
- if (p_mtmr)
- adev->irq = p_mtmr->irq;
- else
- printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
- snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
- }
-#endif
-
- return;
-
-out_noapbt:
- apbt_clear_mapping();
- apb_timer_block_enabled = 0;
- panic("failed to enable APB timer\n");
-}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7f4c081f59f0..4f26700f314d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1747,6 +1747,7 @@ void apic_ap_setup(void)
#ifdef CONFIG_X86_X2APIC
int x2apic_mode;
+EXPORT_SYMBOL_GPL(x2apic_mode);
enum {
X2APIC_OFF,
@@ -2137,18 +2138,11 @@ void __init register_lapic_address(unsigned long address)
* Local APIC interrupts
*/
-/**
- * spurious_interrupt - Catch all for interrupts raised on unused vectors
- * @regs: Pointer to pt_regs on stack
- * @vector: The vector number
- *
- * This is invoked from ASM entry code to catch all interrupts which
- * trigger on an entry which is routed to the common_spurious idtentry
- * point.
- *
- * Also called from sysvec_spurious_apic_interrupt().
+/*
+ * Common handling code for spurious_interrupt and spurious_vector entry
+ * points below. No point in allowing the compiler to inline it twice.
*/
-DEFINE_IDTENTRY_IRQ(spurious_interrupt)
+static noinline void handle_spurious_interrupt(u8 vector)
{
u32 v;
@@ -2183,9 +2177,23 @@ out:
trace_spurious_apic_exit(vector);
}
+/**
+ * spurious_interrupt - Catch all for interrupts raised on unused vectors
+ * @regs: Pointer to pt_regs on stack
+ * @vector: The vector number
+ *
+ * This is invoked from ASM entry code to catch all interrupts which
+ * trigger on an entry which is routed to the common_spurious idtentry
+ * point.
+ */
+DEFINE_IDTENTRY_IRQ(spurious_interrupt)
+{
+ handle_spurious_interrupt(vector);
+}
+
DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt)
{
- __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR);
+ handle_spurious_interrupt(SPURIOUS_APIC_VECTOR);
}
/*
@@ -2334,6 +2342,11 @@ static int cpuid_to_apicid[] = {
[0 ... NR_CPUS - 1] = -1,
};
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+ return phys_id == cpuid_to_apicid[cpu];
+}
+
#ifdef CONFIG_SMP
/**
* apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e4ab4804b20d..73ff4dd426a8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -198,7 +198,7 @@ static int __init parse_noapic(char *str)
}
early_param("noapic", parse_noapic);
-/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
+/* Will be called in mpparse/ACPI codes for saving IRQ info */
void mp_save_irq(struct mpc_intsrc *m)
{
int i;
@@ -1032,6 +1032,16 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
legacy = mp_is_legacy_irq(irq);
+ /*
+ * IRQ2 is unusable for historical reasons on systems which
+ * have a legacy PIC. See the comment vs. IRQ2 further down.
+ *
+ * If this gets removed at some point then the related code
+ * in lapic_assign_system_vectors() needs to be adjusted as
+ * well.
+ */
+ if (legacy && irq == PIC_CASCADE_IR)
+ return -EINVAL;
}
mutex_lock(&ioapic_mutex);
@@ -2863,7 +2873,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
/*
* If mp_register_ioapic() is called during early boot stage when
- * walking ACPI/SFI/DT tables, it's too early to create irqdomain,
+ * walking ACPI/DT tables, it's too early to create irqdomain,
* we are still using bootmem allocator. So delay it to setup_IO_APIC().
*/
if (hotplug) {
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 828be792231e..b14533af7676 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -13,9 +13,6 @@ int main(void)
{
#ifdef CONFIG_PARAVIRT
#ifdef CONFIG_PARAVIRT_XXL
- OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template,
- cpu.usergs_sysret64);
- OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
#endif
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 0b2c03943ac6..23f5f27b5a02 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -10,6 +10,8 @@
*/
#include <linux/interrupt.h>
+
+#include <asm/acrn.h>
#include <asm/apic.h>
#include <asm/cpufeatures.h>
#include <asm/desc.h>
@@ -19,7 +21,7 @@
static u32 __init acrn_detect(void)
{
- return hypervisor_cpuid_base("ACRNACRNACRN", 0);
+ return acrn_cpuid_base();
}
static void __init acrn_init_platform(void)
@@ -55,6 +57,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
set_irq_regs(old_regs);
}
+void acrn_setup_intr_handler(void (*handler)(void))
+{
+ acrn_intr_handler = handler;
+}
+EXPORT_SYMBOL_GPL(acrn_setup_intr_handler);
+
+void acrn_remove_intr_handler(void)
+{
+ acrn_intr_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(acrn_remove_intr_handler);
+
const __initconst struct hypervisor_x86 x86_hyper_acrn = {
.name = "ACRN",
.detect = acrn_detect,
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..ab640abe26b6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -960,6 +960,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x8000000a)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+ if (c->extended_cpuid_level >= 0x8000001f)
+ c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
+
init_scattered_cpuid_features(c);
init_speculation_control(c);
@@ -1739,8 +1742,8 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
-DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
+DEFINE_PER_CPU(void *, hardirq_stack_ptr);
+DEFINE_PER_CPU(bool, hardirq_stack_inuse);
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 816fdbec795a..0e422a544835 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -24,6 +24,7 @@
#include <asm/traps.h>
#include <asm/resctrl.h>
#include <asm/numa.h>
+#include <asm/thermal.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c)
tsx_disable();
split_lock_init();
+
+ intel_init_thermal(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile
index 9f020c994154..015856abdbb1 100644
--- a/arch/x86/kernel/cpu/mce/Makefile
+++ b/arch/x86/kernel/cpu/mce/Makefile
@@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
mce-inject-y := inject.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
-obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
-
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index e133ce1e562b..7962355436da 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -878,6 +878,12 @@ static atomic_t mce_executing;
static atomic_t mce_callin;
/*
+ * Track which CPUs entered the MCA broadcast synchronization and which not in
+ * order to print holdouts.
+ */
+static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
+
+/*
* Check if a timeout waiting for other CPUs happened.
*/
static int mce_timed_out(u64 *t, const char *msg)
@@ -894,8 +900,12 @@ static int mce_timed_out(u64 *t, const char *msg)
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
- if (mca_cfg.tolerant <= 1)
+ if (mca_cfg.tolerant <= 1) {
+ if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
+ pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
+ cpumask_pr_args(&mce_missing_cpus));
mce_panic(msg, NULL, NULL);
+ }
cpu_missing = 1;
return 1;
}
@@ -1006,6 +1016,7 @@ static int mce_start(int *no_way_out)
* is updated before mce_callin.
*/
order = atomic_inc_return(&mce_callin);
+ cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
/*
* Wait for everyone.
@@ -1114,6 +1125,7 @@ static int mce_end(int order)
reset:
atomic_set(&global_nwo, 0);
atomic_set(&mce_callin, 0);
+ cpumask_setall(&mce_missing_cpus);
barrier();
/*
@@ -2178,7 +2190,6 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
- mcheck_intel_therm_init();
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
@@ -2713,6 +2724,7 @@ static void mce_reset(void)
atomic_set(&mce_executing, 0);
atomic_set(&mce_callin, 0);
atomic_set(&global_nwo, 0);
+ cpumask_setall(&mce_missing_cpus);
}
static int fake_panic_get(void *data, u64 *val)
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index c2476fe0682e..e309476743b7 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -531,7 +531,6 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
- intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
intel_ppin_init(c);
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
deleted file mode 100644
index a7cd2d203ced..000000000000
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ /dev/null
@@ -1,739 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Thermal throttle event support code (such as syslog messaging and rate
- * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
- *
- * This allows consistent reporting of CPU thermal throttle events.
- *
- * Maintains a counter in /sys that keeps track of the number of thermal
- * events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog is rate limited).
- *
- * Author: Dmitriy Zavin (dmitriyz@google.com)
- *
- * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
- * Inspired by Ross Biro's and Al Borchers' counter code.
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/processor.h>
-#include <asm/traps.h>
-#include <asm/apic.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-#include <asm/trace/irq_vectors.h>
-
-#include "internal.h"
-
-/* How long to wait between reporting thermal events */
-#define CHECK_INTERVAL (300 * HZ)
-
-#define THERMAL_THROTTLING_EVENT 0
-#define POWER_LIMIT_EVENT 1
-
-/**
- * struct _thermal_state - Represent the current thermal event state
- * @next_check: Stores the next timestamp, when it is allowed
- * to log the next warning message.
- * @last_interrupt_time: Stores the timestamp for the last threshold
- * high event.
- * @therm_work: Delayed workqueue structure
- * @count: Stores the current running count for thermal
- * or power threshold interrupts.
- * @last_count: Stores the previous running count for thermal
- * or power threshold interrupts.
- * @max_time_ms: This shows the maximum amount of time CPU was
- * in throttled state for a single thermal
- * threshold high to low state.
- * @total_time_ms: This is a cumulative time during which CPU was
- * in the throttled state.
- * @rate_control_active: Set when a throttling message is logged.
- * This is used for the purpose of rate-control.
- * @new_event: Stores the last high/low status of the
- * THERM_STATUS_PROCHOT or
- * THERM_STATUS_POWER_LIMIT.
- * @level: Stores whether this _thermal_state instance is
- * for a CORE level or for PACKAGE level.
- * @sample_index: Index for storing the next sample in the buffer
- * temp_samples[].
- * @sample_count: Total number of samples collected in the buffer
- * temp_samples[].
- * @average: The last moving average of temperature samples
- * @baseline_temp: Temperature at which thermal threshold high
- * interrupt was generated.
- * @temp_samples: Storage for temperature samples to calculate
- * moving average.
- *
- * This structure is used to represent data related to thermal state for a CPU.
- * There is a separate storage for core and package level for each CPU.
- */
-struct _thermal_state {
- u64 next_check;
- u64 last_interrupt_time;
- struct delayed_work therm_work;
- unsigned long count;
- unsigned long last_count;
- unsigned long max_time_ms;
- unsigned long total_time_ms;
- bool rate_control_active;
- bool new_event;
- u8 level;
- u8 sample_index;
- u8 sample_count;
- u8 average;
- u8 baseline_temp;
- u8 temp_samples[3];
-};
-
-struct thermal_state {
- struct _thermal_state core_throttle;
- struct _thermal_state core_power_limit;
- struct _thermal_state package_throttle;
- struct _thermal_state package_power_limit;
- struct _thermal_state core_thresh0;
- struct _thermal_state core_thresh1;
- struct _thermal_state pkg_thresh0;
- struct _thermal_state pkg_thresh1;
-};
-
-/* Callback to handle core threshold interrupts */
-int (*platform_thermal_notify)(__u64 msr_val);
-EXPORT_SYMBOL(platform_thermal_notify);
-
-/* Callback to handle core package threshold_interrupts */
-int (*platform_thermal_package_notify)(__u64 msr_val);
-EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
-
-/* Callback support of rate control, return true, if
- * callback has rate control */
-bool (*platform_thermal_package_rate_control)(void);
-EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
-
-
-static DEFINE_PER_CPU(struct thermal_state, thermal_state);
-
-static atomic_t therm_throt_en = ATOMIC_INIT(0);
-
-static u32 lvtthmr_init __read_mostly;
-
-#ifdef CONFIG_SYSFS
-#define define_therm_throt_device_one_ro(_name) \
- static DEVICE_ATTR(_name, 0444, \
- therm_throt_device_show_##_name, \
- NULL) \
-
-#define define_therm_throt_device_show_func(event, name) \
- \
-static ssize_t therm_throt_device_show_##event##_##name( \
- struct device *dev, \
- struct device_attribute *attr, \
- char *buf) \
-{ \
- unsigned int cpu = dev->id; \
- ssize_t ret; \
- \
- preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) { \
- ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).event.name); \
- } else \
- ret = 0; \
- preempt_enable(); \
- \
- return ret; \
-}
-
-define_therm_throt_device_show_func(core_throttle, count);
-define_therm_throt_device_one_ro(core_throttle_count);
-
-define_therm_throt_device_show_func(core_power_limit, count);
-define_therm_throt_device_one_ro(core_power_limit_count);
-
-define_therm_throt_device_show_func(package_throttle, count);
-define_therm_throt_device_one_ro(package_throttle_count);
-
-define_therm_throt_device_show_func(package_power_limit, count);
-define_therm_throt_device_one_ro(package_power_limit_count);
-
-define_therm_throt_device_show_func(core_throttle, max_time_ms);
-define_therm_throt_device_one_ro(core_throttle_max_time_ms);
-
-define_therm_throt_device_show_func(package_throttle, max_time_ms);
-define_therm_throt_device_one_ro(package_throttle_max_time_ms);
-
-define_therm_throt_device_show_func(core_throttle, total_time_ms);
-define_therm_throt_device_one_ro(core_throttle_total_time_ms);
-
-define_therm_throt_device_show_func(package_throttle, total_time_ms);
-define_therm_throt_device_one_ro(package_throttle_total_time_ms);
-
-static struct attribute *thermal_throttle_attrs[] = {
- &dev_attr_core_throttle_count.attr,
- &dev_attr_core_throttle_max_time_ms.attr,
- &dev_attr_core_throttle_total_time_ms.attr,
- NULL
-};
-
-static const struct attribute_group thermal_attr_group = {
- .attrs = thermal_throttle_attrs,
- .name = "thermal_throttle"
-};
-#endif /* CONFIG_SYSFS */
-
-#define CORE_LEVEL 0
-#define PACKAGE_LEVEL 1
-
-#define THERM_THROT_POLL_INTERVAL HZ
-#define THERM_STATUS_PROCHOT_LOG BIT(1)
-
-#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
-#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
-
-static void clear_therm_status_log(int level)
-{
- int msr;
- u64 mask, msr_val;
-
- if (level == CORE_LEVEL) {
- msr = MSR_IA32_THERM_STATUS;
- mask = THERM_STATUS_CLEAR_CORE_MASK;
- } else {
- msr = MSR_IA32_PACKAGE_THERM_STATUS;
- mask = THERM_STATUS_CLEAR_PKG_MASK;
- }
-
- rdmsrl(msr, msr_val);
- msr_val &= mask;
- wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
-}
-
-static void get_therm_status(int level, bool *proc_hot, u8 *temp)
-{
- int msr;
- u64 msr_val;
-
- if (level == CORE_LEVEL)
- msr = MSR_IA32_THERM_STATUS;
- else
- msr = MSR_IA32_PACKAGE_THERM_STATUS;
-
- rdmsrl(msr, msr_val);
- if (msr_val & THERM_STATUS_PROCHOT_LOG)
- *proc_hot = true;
- else
- *proc_hot = false;
-
- *temp = (msr_val >> 16) & 0x7F;
-}
-
-static void __maybe_unused throttle_active_work(struct work_struct *work)
-{
- struct _thermal_state *state = container_of(to_delayed_work(work),
- struct _thermal_state, therm_work);
- unsigned int i, avg, this_cpu = smp_processor_id();
- u64 now = get_jiffies_64();
- bool hot;
- u8 temp;
-
- get_therm_status(state->level, &hot, &temp);
- /* temperature value is offset from the max so lesser means hotter */
- if (!hot && temp > state->baseline_temp) {
- if (state->rate_control_active)
- pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n",
- this_cpu,
- state->level == CORE_LEVEL ? "Core" : "Package",
- state->count);
-
- state->rate_control_active = false;
- return;
- }
-
- if (time_before64(now, state->next_check) &&
- state->rate_control_active)
- goto re_arm;
-
- state->next_check = now + CHECK_INTERVAL;
-
- if (state->count != state->last_count) {
- /* There was one new thermal interrupt */
- state->last_count = state->count;
- state->average = 0;
- state->sample_count = 0;
- state->sample_index = 0;
- }
-
- state->temp_samples[state->sample_index] = temp;
- state->sample_count++;
- state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples);
- if (state->sample_count < ARRAY_SIZE(state->temp_samples))
- goto re_arm;
-
- avg = 0;
- for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i)
- avg += state->temp_samples[i];
-
- avg /= ARRAY_SIZE(state->temp_samples);
-
- if (state->average > avg) {
- pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
- this_cpu,
- state->level == CORE_LEVEL ? "Core" : "Package",
- state->count);
- state->rate_control_active = true;
- }
-
- state->average = avg;
-
-re_arm:
- clear_therm_status_log(state->level);
- schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
-}
-
-/***
- * therm_throt_process - Process thermal throttling event from interrupt
- * @curr: Whether the condition is current or not (boolean), since the
- * thermal interrupt normally gets called both when the thermal
- * event begins and once the event has ended.
- *
- * This function is called by the thermal interrupt after the
- * IRQ has been acknowledged.
- *
- * It will take care of rate limiting and printing messages to the syslog.
- */
-static void therm_throt_process(bool new_event, int event, int level)
-{
- struct _thermal_state *state;
- unsigned int this_cpu = smp_processor_id();
- bool old_event;
- u64 now;
- struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
-
- now = get_jiffies_64();
- if (level == CORE_LEVEL) {
- if (event == THERMAL_THROTTLING_EVENT)
- state = &pstate->core_throttle;
- else if (event == POWER_LIMIT_EVENT)
- state = &pstate->core_power_limit;
- else
- return;
- } else if (level == PACKAGE_LEVEL) {
- if (event == THERMAL_THROTTLING_EVENT)
- state = &pstate->package_throttle;
- else if (event == POWER_LIMIT_EVENT)
- state = &pstate->package_power_limit;
- else
- return;
- } else
- return;
-
- old_event = state->new_event;
- state->new_event = new_event;
-
- if (new_event)
- state->count++;
-
- if (event != THERMAL_THROTTLING_EVENT)
- return;
-
- if (new_event && !state->last_interrupt_time) {
- bool hot;
- u8 temp;
-
- get_therm_status(state->level, &hot, &temp);
- /*
- * Ignore short temperature spike as the system is not close
- * to PROCHOT. 10C offset is large enough to ignore. It is
- * already dropped from the high threshold temperature.
- */
- if (temp > 10)
- return;
-
- state->baseline_temp = temp;
- state->last_interrupt_time = now;
- schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
- } else if (old_event && state->last_interrupt_time) {
- unsigned long throttle_time;
-
- throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time);
- if (throttle_time > state->max_time_ms)
- state->max_time_ms = throttle_time;
- state->total_time_ms += throttle_time;
- state->last_interrupt_time = 0;
- }
-}
-
-static int thresh_event_valid(int level, int event)
-{
- struct _thermal_state *state;
- unsigned int this_cpu = smp_processor_id();
- struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
- u64 now = get_jiffies_64();
-
- if (level == PACKAGE_LEVEL)
- state = (event == 0) ? &pstate->pkg_thresh0 :
- &pstate->pkg_thresh1;
- else
- state = (event == 0) ? &pstate->core_thresh0 :
- &pstate->core_thresh1;
-
- if (time_before64(now, state->next_check))
- return 0;
-
- state->next_check = now + CHECK_INTERVAL;
-
- return 1;
-}
-
-static bool int_pln_enable;
-static int __init int_pln_enable_setup(char *s)
-{
- int_pln_enable = true;
-
- return 1;
-}
-__setup("int_pln_enable", int_pln_enable_setup);
-
-#ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device: */
-static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
-{
- int err;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
- if (err)
- return err;
-
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_core_power_limit_count.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
- }
-
- if (cpu_has(c, X86_FEATURE_PTS)) {
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_throttle_count.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
-
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_throttle_max_time_ms.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
-
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_throttle_total_time_ms.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
-
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_power_limit_count.attr,
- thermal_attr_group.name);
- if (err)
- goto del_group;
- }
- }
-
- return 0;
-
-del_group:
- sysfs_remove_group(&dev->kobj, &thermal_attr_group);
-
- return err;
-}
-
-static void thermal_throttle_remove_dev(struct device *dev)
-{
- sysfs_remove_group(&dev->kobj, &thermal_attr_group);
-}
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int thermal_throttle_online(unsigned int cpu)
-{
- struct thermal_state *state = &per_cpu(thermal_state, cpu);
- struct device *dev = get_cpu_device(cpu);
- u32 l;
-
- state->package_throttle.level = PACKAGE_LEVEL;
- state->core_throttle.level = CORE_LEVEL;
-
- INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
- INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
-
- /* Unmask the thermal vector after the above workqueues are initialized. */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
- return thermal_throttle_add_dev(dev, cpu);
-}
-
-static int thermal_throttle_offline(unsigned int cpu)
-{
- struct thermal_state *state = &per_cpu(thermal_state, cpu);
- struct device *dev = get_cpu_device(cpu);
- u32 l;
-
- /* Mask the thermal vector before draining evtl. pending work */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED);
-
- cancel_delayed_work_sync(&state->package_throttle.therm_work);
- cancel_delayed_work_sync(&state->core_throttle.therm_work);
-
- state->package_throttle.rate_control_active = false;
- state->core_throttle.rate_control_active = false;
-
- thermal_throttle_remove_dev(dev);
- return 0;
-}
-
-static __init int thermal_throttle_init_device(void)
-{
- int ret;
-
- if (!atomic_read(&therm_throt_en))
- return 0;
-
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
- thermal_throttle_online,
- thermal_throttle_offline);
- return ret < 0 ? ret : 0;
-}
-device_initcall(thermal_throttle_init_device);
-
-#endif /* CONFIG_SYSFS */
-
-static void notify_package_thresholds(__u64 msr_val)
-{
- bool notify_thres_0 = false;
- bool notify_thres_1 = false;
-
- if (!platform_thermal_package_notify)
- return;
-
- /* lower threshold check */
- if (msr_val & THERM_LOG_THRESHOLD0)
- notify_thres_0 = true;
- /* higher threshold check */
- if (msr_val & THERM_LOG_THRESHOLD1)
- notify_thres_1 = true;
-
- if (!notify_thres_0 && !notify_thres_1)
- return;
-
- if (platform_thermal_package_rate_control &&
- platform_thermal_package_rate_control()) {
- /* Rate control is implemented in callback */
- platform_thermal_package_notify(msr_val);
- return;
- }
-
- /* lower threshold reached */
- if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
- platform_thermal_package_notify(msr_val);
- /* higher threshold reached */
- if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
- platform_thermal_package_notify(msr_val);
-}
-
-static void notify_thresholds(__u64 msr_val)
-{
- /* check whether the interrupt handler is defined;
- * otherwise simply return
- */
- if (!platform_thermal_notify)
- return;
-
- /* lower threshold reached */
- if ((msr_val & THERM_LOG_THRESHOLD0) &&
- thresh_event_valid(CORE_LEVEL, 0))
- platform_thermal_notify(msr_val);
- /* higher threshold reached */
- if ((msr_val & THERM_LOG_THRESHOLD1) &&
- thresh_event_valid(CORE_LEVEL, 1))
- platform_thermal_notify(msr_val);
-}
-
-/* Thermal transition interrupt handler */
-static void intel_thermal_interrupt(void)
-{
- __u64 msr_val;
-
- if (static_cpu_has(X86_FEATURE_HWP))
- wrmsrl_safe(MSR_HWP_STATUS, 0);
-
- rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-
- /* Check for violation of core thermal thresholds*/
- notify_thresholds(msr_val);
-
- therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
- THERMAL_THROTTLING_EVENT,
- CORE_LEVEL);
-
- if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
- therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
- POWER_LIMIT_EVENT,
- CORE_LEVEL);
-
- if (this_cpu_has(X86_FEATURE_PTS)) {
- rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
- /* check violations of package thermal thresholds */
- notify_package_thresholds(msr_val);
- therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
- THERMAL_THROTTLING_EVENT,
- PACKAGE_LEVEL);
- if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
- therm_throt_process(msr_val &
- PACKAGE_THERM_STATUS_POWER_LIMIT,
- POWER_LIMIT_EVENT,
- PACKAGE_LEVEL);
- }
-}
-
-static void unexpected_thermal_interrupt(void)
-{
- pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
- smp_processor_id());
-}
-
-static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-
-DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
-{
- trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
- inc_irq_stat(irq_thermal_count);
- smp_thermal_vector();
- trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
- ack_APIC_irq();
-}
-
-/* Thermal monitoring depends on APIC, ACPI and clock modulation */
-static int intel_thermal_supported(struct cpuinfo_x86 *c)
-{
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return 0;
- if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
- return 0;
- return 1;
-}
-
-void __init mcheck_intel_therm_init(void)
-{
- /*
- * This function is only called on boot CPU. Save the init thermal
- * LVT value on BSP and use that value to restore APs' thermal LVT
- * entry BIOS programmed later
- */
- if (intel_thermal_supported(&boot_cpu_data))
- lvtthmr_init = apic_read(APIC_LVTTHMR);
-}
-
-void intel_init_thermal(struct cpuinfo_x86 *c)
-{
- unsigned int cpu = smp_processor_id();
- int tm2 = 0;
- u32 l, h;
-
- if (!intel_thermal_supported(c))
- return;
-
- /*
- * First check if its enabled already, in which case there might
- * be some SMM goo which handles it, so we can't even put a handler
- * since it might be delivered via SMI already:
- */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- h = lvtthmr_init;
- /*
- * The initial value of thermal LVT entries on all APs always reads
- * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
- * sequence to them and LVT registers are reset to 0s except for
- * the mask bits which are set to 1s when APs receive INIT IPI.
- * If BIOS takes over the thermal interrupt and sets its interrupt
- * delivery mode to SMI (not fixed), it restores the value that the
- * BIOS has programmed on AP based on BSP's info we saved since BIOS
- * is always setting the same value for all threads/cores.
- */
- if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
- apic_write(APIC_LVTTHMR, lvtthmr_init);
-
-
- if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- if (system_state == SYSTEM_BOOTING)
- pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
- return;
- }
-
- /* early Pentium M models use different method for enabling TM2 */
- if (cpu_has(c, X86_FEATURE_TM2)) {
- if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
- rdmsr(MSR_THERM2_CTL, l, h);
- if (l & MSR_THERM2_CTL_TM_SELECT)
- tm2 = 1;
- } else if (l & MSR_IA32_MISC_ENABLE_TM2)
- tm2 = 1;
- }
-
- /* We'll mask the thermal vector in the lapic till we're ready: */
- h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
- apic_write(APIC_LVTTHMR, h);
-
- rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- (l | (THERM_INT_LOW_ENABLE
- | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
- else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE
- | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
- else
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
-
- if (cpu_has(c, X86_FEATURE_PTS)) {
- rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
- if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- (l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE))
- & ~PACKAGE_THERM_INT_PLN_ENABLE, h);
- else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE
- | PACKAGE_THERM_INT_PLN_ENABLE), h);
- else
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE), h);
- }
-
- smp_thermal_vector = intel_thermal_interrupt;
-
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
- pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
- tm2 ? "TM2" : "TM1");
-
- /* enable thermal throttle processing */
- atomic_set(&therm_throt_en, 1);
-}
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index ec6f0415bc6d..b935e1b5f115 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -830,7 +830,7 @@ static const struct attribute_group cpu_root_microcode_group = {
.attrs = cpu_root_microcode_attrs,
};
-int __init microcode_init(void)
+static int __init microcode_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 43b54bef5448..e88bc296afca 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -31,6 +31,11 @@
#include <asm/reboot.h>
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
+#include <asm/numa.h>
+
+/* Is Linux running as the root partition? */
+bool hv_root_partition;
+EXPORT_SYMBOL_GPL(hv_root_partition);
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -226,6 +231,32 @@ static void __init hv_smp_prepare_boot_cpu(void)
hv_init_spinlocks();
#endif
}
+
+static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
+{
+#ifdef CONFIG_X86_64
+ int i;
+ int ret;
+#endif
+
+ native_smp_prepare_cpus(max_cpus);
+
+#ifdef CONFIG_X86_64
+ for_each_present_cpu(i) {
+ if (i == 0)
+ continue;
+ ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
+ BUG_ON(ret);
+ }
+
+ for_each_present_cpu(i) {
+ if (i == 0)
+ continue;
+ ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i);
+ BUG_ON(ret);
+ }
+#endif
+}
#endif
static void __init ms_hyperv_init_platform(void)
@@ -243,6 +274,7 @@ static void __init ms_hyperv_init_platform(void)
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
@@ -256,6 +288,22 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
/*
+ * Check CPU management privilege.
+ *
+ * To mirror what Windows does we should extract CPU management
+ * features and use the ReservedIdentityBit to detect if Linux is the
+ * root partition. But that requires negotiating CPU management
+ * interface (a process to be finalized).
+ *
+ * For now, use the privilege flag as the indicator for running as
+ * root.
+ */
+ if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) {
+ hv_root_partition = true;
+ pr_info("Hyper-V: running as root partition\n");
+ }
+
+ /*
* Extract host information.
*/
if (cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS) >=
@@ -277,6 +325,14 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}
+ if (ms_hyperv.features_b & HV_ISOLATION) {
+ ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
+ ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
+
+ pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
+ ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+ }
+
if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
ms_hyperv.nested_features =
cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
@@ -366,6 +422,8 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
+ if (hv_root_partition)
+ smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
/*
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 5bd011737272..9231640782fa 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void)
if (!size_base)
continue;
- size_base = to_size_factor(size_base, &size_factor),
+ size_base = to_size_factor(size_base, &size_factor);
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
+ start_base = to_size_factor(start_base, &start_factor);
type = range_state[i].type;
pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index a29997e6cf9e..b90f3f437765 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -3,7 +3,6 @@
* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
* because MTRRs can span up to 40 bits (36bits on most modern x86)
*/
-#define DEBUG
#include <linux/export.h>
#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 61eb26edc6d2..28c8a23aa42e 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -31,8 +31,6 @@
System Programming Guide; Section 9.11. (1997 edition - PPro).
*/
-#define DEBUG
-
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
#include <linux/stop_machine.h>
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index a5ee607a3b89..3ef5868ac588 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -3,7 +3,7 @@
* local apic based NMI watchdog for various CPUs.
*
* This file also handles reservation of performance counters for coordination
- * with other users (like oprofile).
+ * with other users.
*
* Note that these events normally don't tick when the CPU idles. This means
* the frequency varies with CPU load.
@@ -105,15 +105,6 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
}
-/* checks for a bit availability (hack for oprofile) */
-int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
-{
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- return !test_bit(counter, perfctr_nmi_owner);
-}
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-
int reserve_perfctr_nmi(unsigned int msr)
{
unsigned int counter;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index ee71c47844cb..c4d320d02fd5 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -572,6 +572,7 @@ union cpuid_0x10_x_edx {
void rdt_last_cmd_clear(void);
void rdt_last_cmd_puts(const char *s);
+__printf(1, 2)
void rdt_last_cmd_printf(const char *fmt, ...);
void rdt_ctrl_update(void *arg);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 460f3e0df106..f9190adc52cb 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -563,11 +563,11 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
*/
if (rdtgrp->type == RDTCTRL_GROUP) {
- tsk->closid = rdtgrp->closid;
- tsk->rmid = rdtgrp->mon.rmid;
+ WRITE_ONCE(tsk->closid, rdtgrp->closid);
+ WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
} else if (rdtgrp->type == RDTMON_GROUP) {
if (rdtgrp->mon.parent->closid == tsk->closid) {
- tsk->rmid = rdtgrp->mon.rmid;
+ WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
} else {
rdt_last_cmd_puts("Can't move task to different control group\n");
return -EINVAL;
@@ -2310,22 +2310,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
for_each_process_thread(p, t) {
if (!from || is_closid_match(t, from) ||
is_rmid_match(t, from)) {
- t->closid = to->closid;
- t->rmid = to->mon.rmid;
+ WRITE_ONCE(t->closid, to->closid);
+ WRITE_ONCE(t->rmid, to->mon.rmid);
-#ifdef CONFIG_SMP
/*
- * This is safe on x86 w/o barriers as the ordering
- * of writing to task_cpu() and t->on_cpu is
- * reverse to the reading here. The detection is
- * inaccurate as tasks might move or schedule
- * before the smp function call takes place. In
- * such a case the function call is pointless, but
+ * If the task is on a CPU, set the CPU in the mask.
+ * The detection is inaccurate as tasks might move or
+ * schedule before the smp function call takes place.
+ * In such a case the function call is pointless, but
* there is no other side effect.
*/
- if (mask && t->on_cpu)
+ if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
cpumask_set_cpu(task_cpu(t), mask);
-#endif
}
}
read_unlock(&tasklist_lock);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 236924930bf0..972ec3bfa9c0 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -40,11 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
- { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
- { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
- { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 },
- { X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
- { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index f65564a94b9b..7449ef33f081 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -141,7 +141,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
struct sgx_encl_page *entry;
unsigned long phys_addr;
struct sgx_encl *encl;
- unsigned long pfn;
vm_fault_t ret;
encl = vma->vm_private_data;
@@ -168,13 +167,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
- /* Check if another thread got here first to insert the PTE. */
- if (!follow_pfn(vma, addr, &pfn)) {
- mutex_unlock(&encl->lock);
-
- return VM_FAULT_NOPAGE;
- }
-
ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
if (ret != VM_FAULT_NOPAGE) {
mutex_unlock(&encl->lock);
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index c519fc5f6948..8df81a3ed945 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -700,25 +700,27 @@ static bool __init sgx_page_cache_init(void)
return true;
}
-static void __init sgx_init(void)
+static int __init sgx_init(void)
{
int ret;
int i;
if (!cpu_feature_enabled(X86_FEATURE_SGX))
- return;
+ return -ENODEV;
if (!sgx_page_cache_init())
- return;
+ return -ENOMEM;
- if (!sgx_page_reclaimer_init())
+ if (!sgx_page_reclaimer_init()) {
+ ret = -ENOMEM;
goto err_page_cache;
+ }
ret = sgx_drv_init();
if (ret)
goto err_kthread;
- return;
+ return 0;
err_kthread:
kthread_stop(ksgxd_tsk);
@@ -728,6 +730,8 @@ err_page_cache:
vfree(sgx_epc_sections[i].pages);
memunmap(sgx_epc_sections[i].virt_addr);
}
+
+ return ret;
}
device_initcall(sgx_init);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 1dd851397bd9..5601b95944fa 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -128,12 +128,21 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
- unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
+ unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
+ unsigned long *begin;
/*
- * This is a software stack, so 'end' can be a valid stack pointer.
- * It just means the stack is empty.
+ * @end points directly to the top most stack entry to avoid a -8
+ * adjustment in the stack switch hotpath. Adjust it back before
+ * calculating @begin.
+ */
+ end++;
+ begin = end - (IRQ_STACK_SIZE / sizeof(long));
+
+ /*
+ * Due to the switching logic RSP can never be == @end because the
+ * final operation is 'popq %rsp' which means after that RSP points
+ * to the original stack and not to @end.
*/
if (stack < begin || stack >= end)
return false;
@@ -143,8 +152,9 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info
info->end = end;
/*
- * The next stack pointer is the first thing pushed by the entry code
- * after switching to the irq stack.
+ * The next stack pointer is stored at the top of the irq stack
+ * before switching to the irq stack. Actual stack entries are all
+ * below that.
*/
info->next_sp = (unsigned long *)*(end - 1);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 5d8047441a0a..683749b80ae2 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -167,14 +167,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
fx->fop = 0;
fx->rip = 0;
fx->rdp = 0;
- memset(&fx->st_space[0], 0, 128);
+ memset(fx->st_space, 0, sizeof(fx->st_space));
}
/*
* SSE is in init state
*/
if (!(xfeatures & XFEATURE_MASK_SSE))
- memset(&fx->xmm_space[0], 0, 256);
+ memset(fx->xmm_space, 0, sizeof(fx->xmm_space));
/*
* First two features are FPU and SSE, which above we handled
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 0d54099c2a3a..7c273846c687 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -184,6 +184,7 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
* It is also used to copy the retq for trampolines.
*/
SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
+ UNWIND_HINT_FUNC
retq
SYM_FUNC_END(ftrace_epilogue)
@@ -276,7 +277,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
restore_mcount_regs 8
/* Restore flags */
popfq
- UNWIND_HINT_RET_OFFSET
+ UNWIND_HINT_FUNC
jmp ftrace_epilogue
SYM_FUNC_END(ftrace_regs_caller)
@@ -333,8 +334,7 @@ SYM_FUNC_START(ftrace_graph_caller)
retq
SYM_FUNC_END(ftrace_graph_caller)
-SYM_CODE_START(return_to_handler)
- UNWIND_HINT_EMPTY
+SYM_FUNC_START(return_to_handler)
subq $24, %rsp
/* Save the return values */
@@ -349,5 +349,5 @@ SYM_CODE_START(return_to_handler)
movq (%rsp), %rax
addq $24, %rsp
JMP_NOSPEC rdi
-SYM_CODE_END(return_to_handler)
+SYM_FUNC_END(return_to_handler)
#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c5dd50369e2f..58aa712973ac 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -21,6 +21,7 @@
#include <asm/hw_irq.h>
#include <asm/desc.h>
#include <asm/traps.h>
+#include <asm/thermal.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -227,7 +228,7 @@ static __always_inline void handle_irq(struct irq_desc *desc,
struct pt_regs *regs)
{
if (IS_ENABLED(CONFIG_X86_64))
- run_irq_on_irqstack_cond(desc->handle_irq, desc, regs);
+ generic_handle_irq_desc(desc);
else
__handle_irq(desc, regs);
}
@@ -374,3 +375,23 @@ void fixup_irqs(void)
}
}
#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+static void smp_thermal_vector(void)
+{
+ if (x86_thermal_enabled())
+ intel_thermal_interrupt();
+ else
+ pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
+ smp_processor_id());
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
+{
+ trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
+ inc_irq_stat(irq_thermal_count);
+ smp_thermal_vector();
+ trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
+ ack_APIC_irq();
+}
+#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 0b79efc87be5..044902d5a3c4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -22,6 +22,7 @@
#include <asm/apic.h>
#include <asm/nospec-branch.h>
+#include <asm/softirq_stack.h>
#ifdef CONFIG_DEBUG_STACKOVERFLOW
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 440eed558558..1c0fb96b9e39 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,6 +20,7 @@
#include <linux/sched/task_stack.h>
#include <asm/cpu_entry_area.h>
+#include <asm/softirq_stack.h>
#include <asm/irq_stack.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
@@ -48,7 +49,8 @@ static int map_irq_stack(unsigned int cpu)
if (!va)
return -ENOMEM;
- per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
+ /* Store actual TOS to avoid adjustment in the hotpath */
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#else
@@ -60,7 +62,8 @@ static int map_irq_stack(unsigned int cpu)
{
void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
- per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
+ /* Store actual TOS to avoid adjustment in the hotpath */
+ per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#endif
@@ -71,8 +74,3 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return 0;
return map_irq_stack(cpu);
}
-
-void do_softirq_own_stack(void)
-{
- run_on_irqstack_cond(__do_softirq, NULL);
-}
diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S
index 0db0375235b4..8ef35063964b 100644
--- a/arch/x86/kernel/irqflags.S
+++ b/arch/x86/kernel/irqflags.S
@@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl)
ret
SYM_FUNC_END(native_save_fl)
EXPORT_SYMBOL(native_save_fl)
-
-/*
- * void native_restore_fl(unsigned long flags)
- * %eax/%rdi: flags
- */
-SYM_FUNC_START(native_restore_fl)
- push %_ASM_ARG1
- popf
- ret
-SYM_FUNC_END(native_restore_fl)
-EXPORT_SYMBOL(native_restore_fl)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index a65e9e97857f..df776cdca327 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -133,26 +133,6 @@ void synthesize_relcall(void *dest, void *from, void *to)
NOKPROBE_SYMBOL(synthesize_relcall);
/*
- * Skip the prefixes of the instruction.
- */
-static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
-{
- insn_attr_t attr;
-
- attr = inat_get_opcode_attribute((insn_byte_t)*insn);
- while (inat_is_legacy_prefix(attr)) {
- insn++;
- attr = inat_get_opcode_attribute((insn_byte_t)*insn);
- }
-#ifdef CONFIG_X86_64
- if (inat_is_rex_prefix(attr))
- insn++;
-#endif
- return insn;
-}
-NOKPROBE_SYMBOL(skip_prefixes);
-
-/*
* Returns non-zero if INSN is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
@@ -312,25 +292,6 @@ static int can_probe(unsigned long paddr)
}
/*
- * Returns non-zero if opcode modifies the interrupt flag.
- */
-static int is_IF_modifier(kprobe_opcode_t *insn)
-{
- /* Skip prefixes */
- insn = skip_prefixes(insn);
-
- switch (*insn) {
- case 0xfa: /* cli */
- case 0xfb: /* sti */
- case 0xcf: /* iret/iretd */
- case 0x9d: /* popf/popfd */
- return 1;
- }
-
- return 0;
-}
-
-/*
* Copy an instruction with recovering modified instruction by kprobes
* and adjust the displacement if the instruction uses the %rip-relative
* addressing mode. Note that since @real will be the final place of copied
@@ -411,9 +372,9 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
synthesize_reljump(buf + len, p->ainsn.insn + len,
p->addr + insn->length);
len += JMP32_INSN_SIZE;
- p->ainsn.boostable = true;
+ p->ainsn.boostable = 1;
} else {
- p->ainsn.boostable = false;
+ p->ainsn.boostable = 0;
}
return len;
@@ -450,6 +411,67 @@ void free_insn_page(void *page)
module_memfree(page);
}
+static void set_resume_flags(struct kprobe *p, struct insn *insn)
+{
+ insn_byte_t opcode = insn->opcode.bytes[0];
+
+ switch (opcode) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0x9d: /* popf/popfd */
+ /* Check whether the instruction modifies Interrupt Flag or not */
+ p->ainsn.if_modifier = 1;
+ break;
+ case 0x9c: /* pushfl */
+ p->ainsn.is_pushf = 1;
+ break;
+ case 0xcf: /* iret */
+ p->ainsn.if_modifier = 1;
+ fallthrough;
+ case 0xc2: /* ret/lret */
+ case 0xc3:
+ case 0xca:
+ case 0xcb:
+ case 0xea: /* jmp absolute -- ip is correct */
+ /* ip is already adjusted, no more changes required */
+ p->ainsn.is_abs_ip = 1;
+ /* Without resume jump, this is boostable */
+ p->ainsn.boostable = 1;
+ break;
+ case 0xe8: /* call relative - Fix return addr */
+ p->ainsn.is_call = 1;
+ break;
+#ifdef CONFIG_X86_32
+ case 0x9a: /* call absolute -- same as call absolute, indirect */
+ p->ainsn.is_call = 1;
+ p->ainsn.is_abs_ip = 1;
+ break;
+#endif
+ case 0xff:
+ opcode = insn->opcode.bytes[1];
+ if ((opcode & 0x30) == 0x10) {
+ /*
+ * call absolute, indirect
+ * Fix return addr; ip is correct.
+ * But this is not boostable
+ */
+ p->ainsn.is_call = 1;
+ p->ainsn.is_abs_ip = 1;
+ break;
+ } else if (((opcode & 0x31) == 0x20) ||
+ ((opcode & 0x31) == 0x21)) {
+ /*
+ * jmp near and far, absolute indirect
+ * ip is correct.
+ */
+ p->ainsn.is_abs_ip = 1;
+ /* Without resume jump, this is boostable */
+ p->ainsn.boostable = 1;
+ }
+ break;
+ }
+}
+
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
@@ -467,8 +489,8 @@ static int arch_copy_kprobe(struct kprobe *p)
*/
len = prepare_boost(buf, p, &insn);
- /* Check whether the instruction modifies Interrupt Flag or not */
- p->ainsn.if_modifier = is_IF_modifier(buf);
+ /* Analyze the opcode and set resume flags */
+ set_resume_flags(p, &insn);
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
@@ -491,6 +513,9 @@ int arch_prepare_kprobe(struct kprobe *p)
if (!can_probe((unsigned long)p->addr))
return -EILSEQ;
+
+ memset(&p->ainsn, 0, sizeof(p->ainsn));
+
/* insn: must be on special executable page on x86. */
p->ainsn.insn = get_insn_slot();
if (!p->ainsn.insn)
@@ -806,11 +831,6 @@ NOKPROBE_SYMBOL(trampoline_handler);
* 2) If the single-stepped instruction was a call, the return address
* that is atop the stack is the address following the copied instruction.
* We need to make it the address following the original instruction.
- *
- * If this is the first time we've single-stepped the instruction at
- * this probepoint, and the instruction is boostable, boost it: add a
- * jump instruction after the copied instruction, that jumps to the next
- * instruction after the probepoint.
*/
static void resume_execution(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
@@ -818,60 +838,20 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
unsigned long *tos = stack_addr(regs);
unsigned long copy_ip = (unsigned long)p->ainsn.insn;
unsigned long orig_ip = (unsigned long)p->addr;
- kprobe_opcode_t *insn = p->ainsn.insn;
-
- /* Skip prefixes */
- insn = skip_prefixes(insn);
regs->flags &= ~X86_EFLAGS_TF;
- switch (*insn) {
- case 0x9c: /* pushfl */
+
+ /* Fixup the contents of top of stack */
+ if (p->ainsn.is_pushf) {
*tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
*tos |= kcb->kprobe_old_flags;
- break;
- case 0xc2: /* iret/ret/lret */
- case 0xc3:
- case 0xca:
- case 0xcb:
- case 0xcf:
- case 0xea: /* jmp absolute -- ip is correct */
- /* ip is already adjusted, no more changes required */
- p->ainsn.boostable = true;
- goto no_change;
- case 0xe8: /* call relative - Fix return addr */
+ } else if (p->ainsn.is_call) {
*tos = orig_ip + (*tos - copy_ip);
- break;
-#ifdef CONFIG_X86_32
- case 0x9a: /* call absolute -- same as call absolute, indirect */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
-#endif
- case 0xff:
- if ((insn[1] & 0x30) == 0x10) {
- /*
- * call absolute, indirect
- * Fix return addr; ip is correct.
- * But this is not boostable
- */
- *tos = orig_ip + (*tos - copy_ip);
- goto no_change;
- } else if (((insn[1] & 0x31) == 0x20) ||
- ((insn[1] & 0x31) == 0x21)) {
- /*
- * jmp near and far, absolute indirect
- * ip is correct. And this is boostable
- */
- p->ainsn.boostable = true;
- goto no_change;
- }
- break;
- default:
- break;
}
- regs->ip += orig_ip - copy_ip;
+ if (!p->ainsn.is_abs_ip)
+ regs->ip += orig_ip - copy_ip;
-no_change:
restore_btf();
}
NOKPROBE_SYMBOL(resume_execution);
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 373e5fa3ce1f..51c7f5271aee 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -12,7 +12,7 @@
#include "common.h"
-/* Ftrace callback handler for kprobes -- called under preepmt disabed */
+/* Ftrace callback handler for kprobes -- called under preepmt disabled */
void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5e78e01ca3b4..78bb0fae3982 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -836,28 +836,25 @@ static void kvm_kick_cpu(int cpu)
static void kvm_wait(u8 *ptr, u8 val)
{
- unsigned long flags;
-
if (in_nmi())
return;
- local_irq_save(flags);
-
- if (READ_ONCE(*ptr) != val)
- goto out;
-
/*
* halt until it's our turn and kicked. Note that we do safe halt
* for irq enabled case to avoid hang when lock info is overwritten
* in irq spinlock slowpath and no spurious interrupt occur to save us.
*/
- if (arch_irqs_disabled_flags(flags))
- halt();
- else
- safe_halt();
+ if (irqs_disabled()) {
+ if (READ_ONCE(*ptr) == val)
+ halt();
+ } else {
+ local_irq_disable();
-out:
- local_irq_restore(flags);
+ if (READ_ONCE(*ptr) == val)
+ safe_halt();
+
+ local_irq_enable();
+ }
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index aa593743acf6..1fc0962c89c0 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -268,21 +268,20 @@ static void __init kvmclock_init_mem(void)
static int __init kvm_setup_vsyscall_timeinfo(void)
{
-#ifdef CONFIG_X86_64
- u8 flags;
+ kvmclock_init_mem();
- if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall)
- return 0;
+#ifdef CONFIG_X86_64
+ if (per_cpu(hv_clock_per_cpu, 0) && kvmclock_vsyscall) {
+ u8 flags;
- flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
- if (!(flags & PVCLOCK_TSC_STABLE_BIT))
- return 0;
+ flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
+ if (!(flags & PVCLOCK_TSC_STABLE_BIT))
+ return 0;
- kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
+ kvm_clock.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
+ }
#endif
- kvmclock_init_mem();
-
return 0;
}
early_initcall(kvm_setup_vsyscall_timeinfo);
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b8aee71840ae..aa15132228da 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -398,9 +398,15 @@ static void free_ldt_pgtables(struct mm_struct *mm)
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
- tlb_gather_mmu(&tlb, mm, start, end);
+ /*
+ * Although free_pgd_range() is intended for freeing user
+ * page-tables, it also works out for kernel mappings on x86.
+ * We use tlb_gather_mmu_fullmm() to avoid confusing the
+ * range-tracking logic in __tlb_adjust_range().
+ */
+ tlb_gather_mmu_fullmm(&tlb, mm);
free_pgd_range(&tlb, start, end, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ tlb_finish_mmu(&tlb);
#endif
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 34b153cbd4ac..5e9a34b5bd74 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -114,6 +114,7 @@ int apply_relocate(Elf32_Shdr *sechdrs,
*location += sym->st_value;
break;
case R_386_PC32:
+ case R_386_PLT32:
/* Add the value, subtract its position */
*location += sym->st_value - (uint32_t)location;
break;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 8a67d1fa8dc5..ed8ac6bcbafb 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -182,6 +182,13 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
err = security_locked_down(LOCKDOWN_MSR);
if (err)
break;
+
+ err = filter_write(regs[1]);
+ if (err)
+ return err;
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
err = wrmsr_safe_regs_on_cpu(cpu, regs);
if (err)
break;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 6c3407ba6ee9..c60222ab8ab9 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
else if (opfunc == _paravirt_ident_64)
ret = paravirt_patch_ident_64(insn_buff, len);
- else if (type == PARAVIRT_PATCH(cpu.iret) ||
- type == PARAVIRT_PATCH(cpu.usergs_sysret64))
+ else if (type == PARAVIRT_PATCH(cpu.iret))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
#endif
@@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu)
/* These are in entry.S */
extern void native_iret(void);
-extern void native_usergs_sysret64(void);
static struct resource reserve_ioports = {
.start = 0,
@@ -310,9 +308,7 @@ struct paravirt_patch_template pv_ops = {
.cpu.load_sp0 = native_load_sp0,
- .cpu.usergs_sysret64 = native_usergs_sysret64,
.cpu.iret = native_iret,
- .cpu.swapgs = native_swapgs,
#ifdef CONFIG_X86_IOPL_IOPERM
.cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
@@ -324,7 +320,6 @@ struct paravirt_patch_template pv_ops = {
/* Irq ops. */
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
- .irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.irq.safe_halt = native_safe_halt,
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
index ace6e334cb39..abd27ec67397 100644
--- a/arch/x86/kernel/paravirt_patch.c
+++ b/arch/x86/kernel/paravirt_patch.c
@@ -25,10 +25,7 @@ struct patch_xxl {
const unsigned char mmu_read_cr2[3];
const unsigned char mmu_read_cr3[3];
const unsigned char mmu_write_cr3[3];
- const unsigned char irq_restore_fl[2];
const unsigned char cpu_wbinvd[2];
- const unsigned char cpu_usergs_sysret64[6];
- const unsigned char cpu_swapgs[3];
const unsigned char mov64[3];
};
@@ -39,11 +36,7 @@ static const struct patch_xxl patch_data_xxl = {
.mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax
.mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax
.mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3
- .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq
.cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd
- .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8,
- 0x48, 0x0f, 0x07 }, // swapgs; sysretq
- .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs
.mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
};
@@ -76,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
switch (type) {
#ifdef CONFIG_PARAVIRT_XXL
- PATCH_CASE(irq, restore_fl, xxl, insn_buff, len);
PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
@@ -85,8 +77,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
- PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len);
- PATCH_CASE(cpu, swapgs, xxl, insn_buff, len);
PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
#endif
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 2e9006c1e240..42e92ec62973 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -4,9 +4,6 @@
#include <linux/string.h>
#include <linux/kallsyms.h>
-
-#define DEBUG 1
-
static struct iommu_table_entry * __init
find_dependents_of(struct iommu_table_entry *start,
struct iommu_table_entry *finish,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..9c214d7085a4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -161,7 +161,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
#endif
/* Kernel thread ? */
- if (unlikely(p->flags & PF_KTHREAD)) {
+ if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
kthread_frame_init(frame, sp, arg);
return 0;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad582f9ac5a6..d08307df69ad 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -539,7 +539,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
- this_cpu_read(irq_count) != -1);
+ this_cpu_read(hardirq_stack_inuse));
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_prepare(prev_fpu, cpu);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index bedca011459c..87a4143aa7d7 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child)
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static const struct user_regset_view user_x86_32_view; /* Initialized below. */
#endif
+#ifdef CONFIG_X86_64
+static const struct user_regset_view user_x86_64_view; /* Initialized below. */
+#endif
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
@@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request,
int ret;
unsigned long __user *datap = (unsigned long __user *)data;
+#ifdef CONFIG_X86_64
+ /* This is native 64-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_64_view;
+#else
+ /* This is native 32-bit ptrace() */
+ const struct user_regset_view *regset_view = &user_x86_32_view;
+#endif
+
switch (request) {
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
@@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request,
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child,
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
- task_user_regset_view(current),
+ &user_x86_64_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
}
+/*
+ * This is used by the core dump code to decide which regset to dump. The
+ * core dump code writes out the resulting .e_machine and the corresponding
+ * regsets. This is suboptimal if the task is messing around with its CS.L
+ * field, but at worst the core dump will end up missing some information.
+ *
+ * Unfortunately, it is also used by the broken PTRACE_GETREGSET and
+ * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have
+ * no way to make sure that the e_machine they use matches the caller's
+ * expectations. The result is that the data format returned by
+ * PTRACE_GETREGSET depends on the returned CS field (and even the offset
+ * of the returned CS field depends on its value!) and the data format
+ * accepted by PTRACE_SETREGSET is determined by the old CS value. The
+ * upshot is that it is basically impossible to use these APIs correctly.
+ *
+ * The best way to fix it in the long run would probably be to add new
+ * improved ptrace() APIs to read and write registers reliably, possibly by
+ * allowing userspace to select the ELF e_machine variant that they expect.
+ */
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index db115943e8bd..b29657b76e3f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -477,6 +477,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
},
},
+ { /* PCIe Wifi card isn't detected after reboot otherwise */
+ .callback = set_pci_reboot,
+ .ident = "Zotac ZBOX CI327 nano",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NA"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"),
+ },
+ },
+
/* Sony */
{ /* Handle problems with rebooting on Sony VGN-Z540N */
.callback = set_bios_reboot,
@@ -538,31 +547,21 @@ static void emergency_vmx_disable_all(void)
local_irq_disable();
/*
- * We need to disable VMX on all CPUs before rebooting, otherwise
- * we risk hanging up the machine, because the CPU ignores INIT
- * signals when VMX is enabled.
- *
- * We can't take any locks and we may be on an inconsistent
- * state, so we use NMIs as IPIs to tell the other CPUs to disable
- * VMX and halt.
+ * Disable VMX on all CPUs before rebooting, otherwise we risk hanging
+ * the machine, because the CPU blocks INIT when it's in VMX root.
*
- * For safety, we will avoid running the nmi_shootdown_cpus()
- * stuff unnecessarily, but we don't have a way to check
- * if other CPUs have VMX enabled. So we will call it only if the
- * CPU we are running on has VMX enabled.
+ * We can't take any locks and we may be on an inconsistent state, so
+ * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
*
- * We will miss cases where VMX is not enabled on all CPUs. This
- * shouldn't do much harm because KVM always enable VMX on all
- * CPUs anyway. But we can miss it on the small window where KVM
- * is still enabling VMX.
+ * Do the NMI shootdown even if VMX if off on _this_ CPU, as that
+ * doesn't prevent a different CPU from being in VMX root operation.
*/
- if (cpu_has_vmx() && cpu_vmx_enabled()) {
- /* Disable VMX on this CPU. */
- cpu_vmxoff();
+ if (cpu_has_vmx()) {
+ /* Safely force _this_ CPU out of VMX root operation. */
+ __cpu_emergency_vmxoff();
- /* Halt and disable VMX on the other CPUs */
+ /* Halt and exit VMX root operation on the other CPUs. */
nmi_shootdown_cpus(vmxoff_nmi);
-
}
}
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 740f3bdb3f61..d883176ef2ce 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -16,7 +16,6 @@
#include <linux/memblock.h>
#include <linux/pci.h>
#include <linux/root_dev.h>
-#include <linux/sfi.h>
#include <linux/hugetlb.h>
#include <linux/tboot.h>
#include <linux/usb/xhci-dbgp.h>
@@ -1185,7 +1184,6 @@ void __init setup_arch(char **cmdline_p)
* Read APIC and some other early information from ACPI tables.
*/
acpi_boot_init();
- sfi_init();
x86_dtb_init();
/*
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 84c1821819af..04a780abb512 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -121,8 +121,18 @@ static void __init setup_vc_stacks(int cpu)
cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
}
-static __always_inline bool on_vc_stack(unsigned long sp)
+static __always_inline bool on_vc_stack(struct pt_regs *regs)
{
+ unsigned long sp = regs->sp;
+
+ /* User-mode RSP is not trusted */
+ if (user_mode(regs))
+ return false;
+
+ /* SYSCALL gap still has user-mode RSP */
+ if (ip_within_syscall_gap(regs))
+ return false;
+
return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
}
@@ -144,7 +154,7 @@ void noinstr __sev_es_ist_enter(struct pt_regs *regs)
old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
/* Make room on the IST stack */
- if (on_vc_stack(regs->sp))
+ if (on_vc_stack(regs))
new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
else
new_ist = old_ist - sizeof(old_ist);
@@ -248,7 +258,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
int res;
if (user_mode(ctxt->regs)) {
- res = insn_fetch_from_user(ctxt->regs, buffer);
+ res = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
if (!res) {
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
@@ -1248,13 +1258,12 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
{
struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
+ irqentry_state_t irq_state;
struct ghcb_state state;
struct es_em_ctxt ctxt;
enum es_result result;
struct ghcb *ghcb;
- lockdep_assert_irqs_disabled();
-
/*
* Handle #DB before calling into !noinstr code to avoid recursive #DB.
*/
@@ -1263,6 +1272,8 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
return;
}
+ irq_state = irqentry_nmi_enter(regs);
+ lockdep_assert_irqs_disabled();
instrumentation_begin();
/*
@@ -1325,6 +1336,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
out:
instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
return;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ea794a083c44..f306e85a08a6 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -766,30 +766,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
{
- /*
- * This function is fundamentally broken as currently
- * implemented.
- *
- * The idea is that we want to trigger a call to the
- * restart_block() syscall and that we want in_ia32_syscall(),
- * in_x32_syscall(), etc. to match whatever they were in the
- * syscall being restarted. We assume that the syscall
- * instruction at (regs->ip - 2) matches whatever syscall
- * instruction we used to enter in the first place.
- *
- * The problem is that we can get here when ptrace pokes
- * syscall-like values into regs even if we're not in a syscall
- * at all.
- *
- * For now, we maintain historical behavior and guess based on
- * stored state. We could do better by saving the actual
- * syscall arch in restart_block or (with caveats on x32) by
- * checking if regs->ip points to 'int $0x80'. The current
- * behavior is incorrect if a tracer has a different bitness
- * than the tracee.
- */
#ifdef CONFIG_IA32_EMULATION
- if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
+ if (current->restart_block.arch_data & TS_COMPAT)
return __NR_ia32_restart_syscall;
#endif
#ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index ca9a380d9c0b..9442c4136c38 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -11,14 +11,26 @@ enum insn_type {
RET = 3, /* tramp / site cond-tail-call */
};
+/*
+ * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax
+ * The REX.W cancels the effect of any data16.
+ */
+static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
+ const void *emulate = NULL;
int size = CALL_INSN_SIZE;
const void *code;
switch (type) {
case CALL:
code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+ if (func == &__static_call_return0) {
+ emulate = code;
+ code = &xor5rax;
+ }
+
break;
case NOP:
@@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
if (unlikely(system_state == SYSTEM_BOOTING))
return text_poke_early(insn, code, size);
- text_poke_bp(insn, code, size, NULL);
+ text_poke_bp(insn, code, size, emulate);
}
static void __static_call_validate(void *insn, bool tail)
@@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail)
return;
} else {
if (opcode == CALL_INSN_OPCODE ||
- !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5))
+ !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) ||
+ !memcmp(insn, xor5rax, 5))
return;
}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 504fa5425bce..660b78827638 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
{
- long error;
- error = -EINVAL;
if (off & ~PAGE_MASK)
- goto out;
+ return -EINVAL;
- error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
-out:
- return error;
+ return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}
static void find_start_end(unsigned long addr, unsigned long flags,
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7f5aec758f0e..ac1874a2a70e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -694,8 +694,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
* In the SYSCALL entry path the RSP value comes from user-space - don't
* trust it and switch to the current kernel stack
*/
- if (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
- regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) {
+ if (ip_within_syscall_gap(regs)) {
sp = this_cpu_read(cpu_current_top_of_stack);
goto sync;
}
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 73f800100066..a1202536fc57 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -13,7 +13,7 @@
#define orc_warn_current(args...) \
({ \
- if (state->task == current) \
+ if (state->task == current && !state->error) \
orc_warn(args); \
})
@@ -367,8 +367,8 @@ static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
return false;
- *ip = regs->ip;
- *sp = regs->sp;
+ *ip = READ_ONCE_NOCHECK(regs->ip);
+ *sp = READ_ONCE_NOCHECK(regs->sp);
return true;
}
@@ -380,8 +380,8 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr
if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
return false;
- *ip = regs->ip;
- *sp = regs->sp;
+ *ip = READ_ONCE_NOCHECK(regs->ip);
+ *sp = READ_ONCE_NOCHECK(regs->sp);
return true;
}
@@ -402,12 +402,12 @@ static bool get_reg(struct unwind_state *state, unsigned int reg_off,
return false;
if (state->full_regs) {
- *val = ((unsigned long *)state->regs)[reg];
+ *val = READ_ONCE_NOCHECK(((unsigned long *)state->regs)[reg]);
return true;
}
if (state->prev_regs) {
- *val = ((unsigned long *)state->prev_regs)[reg];
+ *val = READ_ONCE_NOCHECK(((unsigned long *)state->prev_regs)[reg]);
return true;
}
@@ -471,7 +471,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_REG_SP_INDIRECT:
- sp = state->sp + orc->sp_offset;
+ sp = state->sp;
indirect = true;
break;
@@ -521,6 +521,9 @@ bool unwind_next_frame(struct unwind_state *state)
if (indirect) {
if (!deref_stack_reg(state, sp, &sp))
goto err;
+
+ if (orc->sp_reg == ORC_REG_SP_INDIRECT)
+ sp += orc->sp_offset;
}
/* Find IP, SP and possibly regs: */
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 764573de3996..e5a7a10a0164 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
unsafe_put_user(regs->ds, &user->regs.ds, Efault_end);
unsafe_put_user(regs->fs, &user->regs.fs, Efault_end);
unsafe_put_user(regs->gs, &user->regs.gs, Efault_end);
- unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end);
+
+ /*
+ * Don't write screen_bitmap in case some user had a value there
+ * and expected it to remain unchanged.
+ */
user_access_end();
@@ -160,49 +164,6 @@ Efault:
do_exit(SIGSEGV);
}
-static void mark_screen_rdonly(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
- spinlock_t *ptl;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int i;
-
- mmap_write_lock(mm);
- pgd = pgd_offset(mm, 0xA0000);
- if (pgd_none_or_clear_bad(pgd))
- goto out;
- p4d = p4d_offset(pgd, 0xA0000);
- if (p4d_none_or_clear_bad(p4d))
- goto out;
- pud = pud_offset(p4d, 0xA0000);
- if (pud_none_or_clear_bad(pud))
- goto out;
- pmd = pmd_offset(pud, 0xA0000);
-
- if (pmd_trans_huge(*pmd)) {
- vma = find_vma(mm, 0xA0000);
- split_huge_pmd(vma, pmd, 0xA0000);
- }
- if (pmd_none_or_clear_bad(pmd))
- goto out;
- pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
- for (i = 0; i < 32; i++) {
- if (pte_present(*pte))
- set_pte(pte, pte_wrprotect(*pte));
- pte++;
- }
- pte_unmap_unlock(pte, ptl);
-out:
- mmap_write_unlock(mm);
- flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
-}
-
-
-
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
@@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
offsetof(struct vm86_struct, int_revectored)))
return -EFAULT;
+
+ /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
+ if (v.flags & VM86_SCREEN_BITMAP) {
+ char comm[TASK_COMM_LEN];
+
+ pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current));
+ return -EINVAL;
+ }
+
memset(&vm86regs, 0, sizeof(vm86regs));
vm86regs.pt.bx = v.regs.ebx;
@@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86regs.gs = v.regs.gs;
vm86->flags = v.flags;
- vm86->screen_bitmap = v.screen_bitmap;
vm86->cpu_type = v.cpu_type;
if (copy_from_user(&vm86->int_revectored,
@@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
update_task_stack(tsk);
preempt_enable();
- if (vm86->flags & VM86_SCREEN_BITMAP)
- mark_screen_rdonly(tsk->mm);
-
memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
return regs->ax;
}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 7ac592664c52..a788d5120d4d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -103,6 +103,15 @@ config KVM_AMD_SEV
Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
with Encrypted State (SEV-ES) on AMD processors.
+config KVM_XEN
+ bool "Support for Xen hypercall interface"
+ depends on KVM
+ help
+ Provides KVM support for the hosting Xen HVM guests and
+ passing Xen hypercalls to userspace.
+
+ If in doubt, say "N".
+
config KVM_MMU_AUDIT
bool "Audit KVM MMU"
depends on KVM && TRACEPOINTS
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4bd14ab01323..1b4766fe1de2 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -17,7 +17,9 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o \
- mmu/spte.o mmu/tdp_iter.o mmu/tdp_mmu.o
+ mmu/spte.o
+kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
+kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 38172ca627d3..6bd2f8b830e4 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -173,16 +173,22 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_update_pv_runtime(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- kvm_mmu_reset_context(vcpu);
+ vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
kvm_pmu_refresh(vcpu);
vcpu->arch.cr4_guest_rsvd_bits =
__cr4_reserved_bits(guest_cpuid_has, vcpu);
- vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+ kvm_hv_set_cpuid(vcpu);
/* Invoke the vendor callback only after the above state is updated. */
- kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
+ static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
+
+ /*
+ * Except for the MMU, which needs to be reset after any vendor
+ * specific adjustments to the reserved GPA bits.
+ */
+ kvm_mmu_reset_context(vcpu);
}
static int is_efer_nx(void)
@@ -223,6 +229,16 @@ not_found:
return 36;
}
+/*
+ * This "raw" version returns the reserved GPA bits without any adjustments for
+ * encryption technologies that usurp bits. The raw mask should be used if and
+ * only if hardware does _not_ strip the usurped bits, e.g. in virtual MTRRs.
+ */
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu)
+{
+ return rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+}
+
/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid,
@@ -392,7 +408,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_7_0_EBX,
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
- F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
+ F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
@@ -434,7 +450,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
kvm_cpu_cap_mask(CPUID_7_1_EAX,
- F(AVX512_BF16)
+ F(AVX_VNNI) | F(AVX512_BF16)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index dc921d76e42e..2a0c5064497f 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -30,15 +30,32 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool exact_only);
int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu);
static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
{
return vcpu->arch.maxphyaddr;
}
+static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return !(gpa & vcpu->arch.reserved_gpa_bits);
+}
+
static inline bool kvm_vcpu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
- return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
+ return !kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu,
+ gpa_t gpa, gpa_t alignment)
+{
+ return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa);
+}
+
+static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE);
}
struct cpuid_reg {
@@ -324,11 +341,6 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
kvm_cpu_cap_set(x86_feature);
}
-static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
-}
-
static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu,
unsigned int kvm_feature)
{
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66a08322988f..f7970ba6219f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2506,12 +2506,12 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
val = GET_SMSTATE(u32, smstate, 0x7fcc);
- if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 6, val))
return X86EMUL_UNHANDLEABLE;
val = GET_SMSTATE(u32, smstate, 0x7fc8);
- if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 7, val))
return X86EMUL_UNHANDLEABLE;
selector = GET_SMSTATE(u32, smstate, 0x7fc4);
@@ -2564,14 +2564,14 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
- val = GET_SMSTATE(u32, smstate, 0x7f68);
+ val = GET_SMSTATE(u64, smstate, 0x7f68);
- if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 6, val))
return X86EMUL_UNHANDLEABLE;
- val = GET_SMSTATE(u32, smstate, 0x7f60);
+ val = GET_SMSTATE(u64, smstate, 0x7f60);
- if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1))
+ if (ctxt->ops->set_dr(ctxt, 7, val))
return X86EMUL_UNHANDLEABLE;
cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
@@ -4329,7 +4329,7 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
ctxt->ops->get_dr(ctxt, 6, &dr6);
dr6 &= ~DR_TRAP_BITS;
- dr6 |= DR6_BD | DR6_RTM;
+ dr6 |= DR6_BD | DR6_ACTIVE_LOW;
ctxt->ops->set_dr(ctxt, 6, dr6);
return emulate_db(ctxt);
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 922c69dcca4d..f98370a39936 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,6 +23,7 @@
#include "ioapic.h"
#include "cpuid.h"
#include "hyperv.h"
+#include "xen.h"
#include <linux/cpu.h>
#include <linux/kvm_host.h>
@@ -36,6 +37,9 @@
#include "trace.h"
#include "irq.h"
+/* "Hv#1" signature */
+#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
+
#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -128,7 +132,7 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
synic_update_vector(synic, vector);
/* Load SynIC vectors into EOI exit bitmap */
- kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, hv_synic_to_vcpu(synic));
return 0;
}
@@ -141,10 +145,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx)
return NULL;
vcpu = kvm_get_vcpu(kvm, vpidx);
- if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+ if (vcpu && kvm_hv_get_vpindex(vcpu) == vpidx)
return vcpu;
kvm_for_each_vcpu(i, vcpu, kvm)
- if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx)
+ if (kvm_hv_get_vpindex(vcpu) == vpidx)
return vcpu;
return NULL;
}
@@ -155,17 +159,17 @@ static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx)
struct kvm_vcpu_hv_synic *synic;
vcpu = get_vcpu_by_vpidx(kvm, vpidx);
- if (!vcpu)
+ if (!vcpu || !to_hv_vcpu(vcpu))
return NULL;
- synic = vcpu_to_synic(vcpu);
+ synic = to_hv_synic(vcpu);
return (synic->active) ? synic : NULL;
}
static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
{
struct kvm *kvm = vcpu->kvm;
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_vcpu_hv_stimer *stimer;
int gsi, idx;
@@ -189,8 +193,8 @@ static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
hv_vcpu->exit.u.synic.msr = msr;
@@ -204,7 +208,7 @@ static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
u32 msr, u64 data, bool host)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
int ret;
if (!synic->active && !host)
@@ -282,8 +286,7 @@ static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu)
static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL)
hv->hv_syndbg.control.status =
@@ -293,8 +296,8 @@ static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu)
static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
{
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG;
hv_vcpu->exit.u.syndbg.msr = msr;
@@ -310,13 +313,13 @@ static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr)
static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
return 1;
trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id,
- vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data);
+ to_hv_vcpu(vcpu)->vp_index, msr, data);
switch (msr) {
case HV_X64_MSR_SYNDBG_CONTROL:
syndbg->control.control = data;
@@ -349,7 +352,7 @@ static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
if (!kvm_hv_is_syndbg_enabled(vcpu) && !host)
return 1;
@@ -377,9 +380,7 @@ static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
break;
}
- trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id,
- vcpu_to_hv_vcpu(vcpu)->vp_index, msr,
- *pdata);
+ trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, kvm_hv_get_vpindex(vcpu), msr, *pdata);
return 0;
}
@@ -421,7 +422,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
struct kvm_lapic_irq irq;
int ret, vector;
@@ -457,7 +458,7 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint)
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
{
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
int i;
trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
@@ -514,15 +515,15 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
static u64 get_time_ref_counter(struct kvm *kvm)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct kvm_vcpu *vcpu;
u64 tsc;
/*
- * The guest has not set up the TSC page or the clock isn't
- * stable, fall back to get_kvmclock_ns.
+ * Fall back to get_kvmclock_ns() when TSC page hasn't been set up,
+ * is broken, disabled or being updated.
*/
- if (!hv->tsc_ref.tsc_sequence)
+ if (hv->hv_tsc_page_status != HV_TSC_PAGE_SET)
return div_u64(get_kvmclock_ns(kvm), 100);
vcpu = kvm_get_vcpu(kvm, 0);
@@ -534,10 +535,10 @@ static u64 get_time_ref_counter(struct kvm *kvm)
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
bool vcpu_kick)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
set_bit(stimer->index,
- vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
if (vcpu_kick)
kvm_vcpu_kick(vcpu);
@@ -545,14 +546,14 @@ static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
- trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_cleanup(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index);
hrtimer_cancel(&stimer->timer);
clear_bit(stimer->index,
- vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ to_hv_vcpu(vcpu)->stimer_pending_bitmap);
stimer->msg_pending = false;
stimer->exp_time = 0;
}
@@ -562,7 +563,7 @@ static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
struct kvm_vcpu_hv_stimer *stimer;
stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
- trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_callback(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index);
stimer_mark_pending(stimer, true);
@@ -579,7 +580,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
u64 time_now;
ktime_t ktime_now;
- time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
+ time_now = get_time_ref_counter(hv_stimer_to_vcpu(stimer)->kvm);
ktime_now = ktime_get();
if (stimer->config.periodic) {
@@ -596,7 +597,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
stimer->exp_time = time_now + stimer->count;
trace_kvm_hv_stimer_start_periodic(
- stimer_to_vcpu(stimer)->vcpu_id,
+ hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index,
time_now, stimer->exp_time);
@@ -618,7 +619,7 @@ static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
return 0;
}
- trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_start_one_shot(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index,
time_now, stimer->count);
@@ -633,13 +634,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
{
union hv_stimer_config new_config = {.as_uint64 = config},
old_config = {.as_uint64 = stimer->config.as_uint64};
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
if (!synic->active && !host)
return 1;
- trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_set_config(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, config, host);
stimer_cleanup(stimer);
@@ -657,13 +658,13 @@ static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
bool host)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
+ struct kvm_vcpu_hv_synic *synic = to_hv_synic(vcpu);
if (!synic->active && !host)
return 1;
- trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_set_count(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, count, host);
stimer_cleanup(stimer);
@@ -694,7 +695,7 @@ static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
struct hv_message *src_msg, bool no_retry)
{
- struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
int msg_off = offsetof(struct hv_message_page, sint_message[sint]);
gfn_t msg_page_gfn;
struct hv_message_header hv_hdr;
@@ -750,7 +751,7 @@ static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
struct hv_message *msg = &stimer->msg;
struct hv_timer_message_payload *payload =
(struct hv_timer_message_payload *)&msg->u.payload;
@@ -763,14 +764,14 @@ static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
payload->expiration_time = stimer->exp_time;
payload->delivery_time = get_time_ref_counter(vcpu->kvm);
- return synic_deliver_msg(vcpu_to_synic(vcpu),
+ return synic_deliver_msg(to_hv_synic(vcpu),
stimer->config.sintx, msg,
no_retry);
}
static int stimer_notify_direct(struct kvm_vcpu_hv_stimer *stimer)
{
- struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct kvm_vcpu *vcpu = hv_stimer_to_vcpu(stimer);
struct kvm_lapic_irq irq = {
.delivery_mode = APIC_DM_FIXED,
.vector = stimer->config.apic_vector
@@ -790,7 +791,7 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
r = stimer_send_msg(stimer);
else
r = stimer_notify_direct(stimer);
- trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
+ trace_kvm_hv_stimer_expiration(hv_stimer_to_vcpu(stimer)->vcpu_id,
stimer->index, direct, r);
if (!r) {
stimer->msg_pending = false;
@@ -801,11 +802,14 @@ static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_vcpu_hv_stimer *stimer;
u64 time_now, exp_time;
int i;
+ if (!hv_vcpu)
+ return;
+
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
stimer = &hv_vcpu->stimer[i];
@@ -831,16 +835,27 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
int i;
+ if (!hv_vcpu)
+ return;
+
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
stimer_cleanup(&hv_vcpu->stimer[i]);
+
+ kfree(hv_vcpu);
+ vcpu->arch.hyperv = NULL;
}
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
{
- if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!(hv_vcpu->hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE))
return false;
return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
}
@@ -880,28 +895,41 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
stimer_prepare_msg(stimer);
}
-void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu;
int i;
+ hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT);
+ if (!hv_vcpu)
+ return -ENOMEM;
+
+ vcpu->arch.hyperv = hv_vcpu;
+ hv_vcpu->vcpu = vcpu;
+
synic_init(&hv_vcpu->synic);
bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
stimer_init(&hv_vcpu->stimer[i], i);
-}
-
-void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu)
-{
- struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
+
+ return 0;
}
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
{
- struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu_hv_synic *synic;
+ int r;
+
+ if (!to_hv_vcpu(vcpu)) {
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ return r;
+ }
+
+ synic = to_hv_synic(vcpu);
/*
* Hyper-V SynIC auto EOI SINT's are
@@ -939,10 +967,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
return r;
}
-static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
- u32 index, u64 *pdata)
+static int kvm_hv_msr_get_crash_data(struct kvm *kvm, u32 index, u64 *pdata)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
size_t size = ARRAY_SIZE(hv->hv_crash_param);
if (WARN_ON_ONCE(index >= size))
@@ -952,41 +979,26 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
return 0;
}
-static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata)
+static int kvm_hv_msr_get_crash_ctl(struct kvm *kvm, u64 *pdata)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
*pdata = hv->hv_crash_ctl;
return 0;
}
-static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host)
+static int kvm_hv_msr_set_crash_ctl(struct kvm *kvm, u64 data)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
-
- if (host)
- hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
- if (!host && (data & HV_CRASH_CTL_CRASH_NOTIFY)) {
-
- vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
- hv->hv_crash_param[0],
- hv->hv_crash_param[1],
- hv->hv_crash_param[2],
- hv->hv_crash_param[3],
- hv->hv_crash_param[4]);
-
- /* Send notification about crash to user space */
- kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
- }
+ hv->hv_crash_ctl = data & HV_CRASH_CTL_CRASH_NOTIFY;
return 0;
}
-static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
- u32 index, u64 data)
+static int kvm_hv_msr_set_crash_data(struct kvm *kvm, u32 index, u64 data)
{
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
size_t size = ARRAY_SIZE(hv->hv_crash_param);
if (WARN_ON_ONCE(index >= size))
@@ -1065,20 +1077,36 @@ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
return true;
}
+/*
+ * Don't touch TSC page values if the guest has opted for TSC emulation after
+ * migration. KVM doesn't fully support reenlightenment notifications and TSC
+ * access emulation and Hyper-V is known to expect the values in TSC page to
+ * stay constant before TSC access emulation is disabled from guest side
+ * (HV_X64_MSR_TSC_EMULATION_STATUS). KVM userspace is expected to preserve TSC
+ * frequency and guest visible TSC value across migration (and prevent it when
+ * TSC scaling is unsupported).
+ */
+static inline bool tsc_page_update_unsafe(struct kvm_hv *hv)
+{
+ return (hv->hv_tsc_page_status != HV_TSC_PAGE_GUEST_CHANGED) &&
+ hv->hv_tsc_emulation_control;
+}
+
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
u32 tsc_seq;
u64 gfn;
BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0);
- if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET)
return;
- mutex_lock(&kvm->arch.hyperv.hv_lock);
+ mutex_lock(&hv->hv_lock);
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
goto out_unlock;
@@ -1089,7 +1117,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
*/
if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
&tsc_seq, sizeof(tsc_seq))))
+ goto out_err;
+
+ if (tsc_seq && tsc_page_update_unsafe(hv)) {
+ if (kvm_read_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ goto out_err;
+
+ hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
goto out_unlock;
+ }
/*
* While we're computing and writing the parameters, force the
@@ -1098,15 +1134,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
hv->tsc_ref.tsc_sequence = 0;
if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
- goto out_unlock;
+ goto out_err;
if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
- goto out_unlock;
+ goto out_err;
/* Ensure sequence is zero before writing the rest of the struct. */
smp_wmb();
if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
- goto out_unlock;
+ goto out_err;
/*
* Now switch to the TSC page mechanism by writing the sequence.
@@ -1119,17 +1155,54 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
smp_wmb();
hv->tsc_ref.tsc_sequence = tsc_seq;
- kvm_write_guest(kvm, gfn_to_gpa(gfn),
- &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ goto out_err;
+
+ hv->hv_tsc_page_status = HV_TSC_PAGE_SET;
+ goto out_unlock;
+
+out_err:
+ hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
out_unlock:
- mutex_unlock(&kvm->arch.hyperv.hv_lock);
+ mutex_unlock(&hv->hv_lock);
+}
+
+void kvm_hv_invalidate_tsc_page(struct kvm *kvm)
+{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+ u64 gfn;
+
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_BROKEN ||
+ hv->hv_tsc_page_status == HV_TSC_PAGE_UNSET ||
+ tsc_page_update_unsafe(hv))
+ return;
+
+ mutex_lock(&hv->hv_lock);
+
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ goto out_unlock;
+
+ /* Preserve HV_TSC_PAGE_GUEST_CHANGED/HV_TSC_PAGE_HOST_CHANGED states */
+ if (hv->hv_tsc_page_status == HV_TSC_PAGE_SET)
+ hv->hv_tsc_page_status = HV_TSC_PAGE_UPDATING;
+
+ gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+
+ hv->tsc_ref.tsc_sequence = 0;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ hv->hv_tsc_page_status = HV_TSC_PAGE_BROKEN;
+
+out_unlock:
+ mutex_unlock(&hv->hv_lock);
}
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
bool host)
{
struct kvm *kvm = vcpu->kvm;
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
@@ -1139,9 +1212,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
break;
case HV_X64_MSR_HYPERCALL: {
- u64 gfn;
- unsigned long addr;
- u8 instructions[4];
+ u8 instructions[9];
+ int i = 0;
+ u64 addr;
/* if guest os id is not set hypercall should remain disabled */
if (!hv->hv_guest_os_id)
@@ -1150,29 +1223,67 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
hv->hv_hypercall = data;
break;
}
- gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return 1;
- kvm_x86_ops.patch_hypercall(vcpu, instructions);
- ((unsigned char *)instructions)[3] = 0xc3; /* ret */
- if (__copy_to_user((void __user *)addr, instructions, 4))
+
+ /*
+ * If Xen and Hyper-V hypercalls are both enabled, disambiguate
+ * the same way Xen itself does, by setting the bit 31 of EAX
+ * which is RsvdZ in the 32-bit Hyper-V hypercall ABI and just
+ * going to be clobbered on 64-bit.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ /* orl $0x80000000, %eax */
+ instructions[i++] = 0x0d;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x00;
+ instructions[i++] = 0x80;
+ }
+
+ /* vmcall/vmmcall */
+ static_call(kvm_x86_patch_hypercall)(vcpu, instructions + i);
+ i += 3;
+
+ /* ret */
+ ((unsigned char *)instructions)[i++] = 0xc3;
+
+ addr = data & HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK;
+ if (kvm_vcpu_write_guest(vcpu, addr, instructions, i))
return 1;
hv->hv_hypercall = data;
- mark_page_dirty(kvm, gfn);
break;
}
case HV_X64_MSR_REFERENCE_TSC:
hv->hv_tsc_page = data;
- if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+ if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE) {
+ if (!host)
+ hv->hv_tsc_page_status = HV_TSC_PAGE_GUEST_CHANGED;
+ else
+ hv->hv_tsc_page_status = HV_TSC_PAGE_HOST_CHANGED;
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+ } else {
+ hv->hv_tsc_page_status = HV_TSC_PAGE_UNSET;
+ }
break;
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
- return kvm_hv_msr_set_crash_data(vcpu,
+ return kvm_hv_msr_set_crash_data(kvm,
msr - HV_X64_MSR_CRASH_P0,
data);
case HV_X64_MSR_CRASH_CTL:
- return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+ if (host)
+ return kvm_hv_msr_set_crash_ctl(kvm, data);
+
+ if (data & HV_CRASH_CTL_CRASH_NOTIFY) {
+ vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+ hv->hv_crash_param[0],
+ hv->hv_crash_param[1],
+ hv->hv_crash_param[2],
+ hv->hv_crash_param[3],
+ hv->hv_crash_param[4]);
+
+ /* Send notification about crash to user space */
+ kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+ }
+ break;
case HV_X64_MSR_RESET:
if (data == 1) {
vcpu_debug(vcpu, "hyper-v reset requested\n");
@@ -1186,6 +1297,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
hv->hv_tsc_emulation_control = data;
break;
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ if (data && !host)
+ return 1;
+
hv->hv_tsc_emulation_status = data;
break;
case HV_X64_MSR_TIME_REF_COUNT:
@@ -1216,11 +1330,11 @@ static u64 current_task_runtime_100ns(void)
static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
switch (msr) {
case HV_X64_MSR_VP_INDEX: {
- struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
int vcpu_idx = kvm_vcpu_get_idx(vcpu);
u32 new_vp_index = (u32)data;
@@ -1291,14 +1405,14 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
case HV_X64_MSR_SIMP:
case HV_X64_MSR_EOM:
case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
- return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host);
+ return synic_set_msr(to_hv_synic(vcpu), msr, data, host);
case HV_X64_MSR_STIMER0_CONFIG:
case HV_X64_MSR_STIMER1_CONFIG:
case HV_X64_MSR_STIMER2_CONFIG:
case HV_X64_MSR_STIMER3_CONFIG: {
int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
- return stimer_set_config(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_set_config(to_hv_stimer(vcpu, timer_index),
data, host);
}
case HV_X64_MSR_STIMER0_COUNT:
@@ -1307,7 +1421,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
case HV_X64_MSR_STIMER3_COUNT: {
int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
- return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_set_count(to_hv_stimer(vcpu, timer_index),
data, host);
}
case HV_X64_MSR_TSC_FREQUENCY:
@@ -1330,7 +1444,7 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
{
u64 data = 0;
struct kvm *kvm = vcpu->kvm;
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
switch (msr) {
case HV_X64_MSR_GUEST_OS_ID:
@@ -1346,11 +1460,11 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = hv->hv_tsc_page;
break;
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
- return kvm_hv_msr_get_crash_data(vcpu,
+ return kvm_hv_msr_get_crash_data(kvm,
msr - HV_X64_MSR_CRASH_P0,
pdata);
case HV_X64_MSR_CRASH_CTL:
- return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+ return kvm_hv_msr_get_crash_ctl(kvm, pdata);
case HV_X64_MSR_RESET:
data = 0;
break;
@@ -1379,7 +1493,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
bool host)
{
u64 data = 0;
- struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
switch (msr) {
case HV_X64_MSR_VP_INDEX:
@@ -1403,14 +1517,14 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_SIMP:
case HV_X64_MSR_EOM:
case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
- return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host);
+ return synic_get_msr(to_hv_synic(vcpu), msr, pdata, host);
case HV_X64_MSR_STIMER0_CONFIG:
case HV_X64_MSR_STIMER1_CONFIG:
case HV_X64_MSR_STIMER2_CONFIG:
case HV_X64_MSR_STIMER3_CONFIG: {
int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
- return stimer_get_config(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_get_config(to_hv_stimer(vcpu, timer_index),
pdata);
}
case HV_X64_MSR_STIMER0_COUNT:
@@ -1419,7 +1533,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_STIMER3_COUNT: {
int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
- return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
+ return stimer_get_count(to_hv_stimer(vcpu, timer_index),
pdata);
}
case HV_X64_MSR_TSC_FREQUENCY:
@@ -1438,12 +1552,22 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+ }
+
if (kvm_hv_msr_partition_wide(msr)) {
int r;
- mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_lock(&hv->hv_lock);
r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
- mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_unlock(&hv->hv_lock);
return r;
} else
return kvm_hv_set_msr(vcpu, msr, data, host);
@@ -1451,12 +1575,22 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+
+ if (!host && !vcpu->arch.hyperv_enabled)
+ return 1;
+
+ if (!to_hv_vcpu(vcpu)) {
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
+ }
+
if (kvm_hv_msr_partition_wide(msr)) {
int r;
- mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_lock(&hv->hv_lock);
r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host);
- mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
+ mutex_unlock(&hv->hv_lock);
return r;
} else
return kvm_hv_get_msr(vcpu, msr, pdata, host);
@@ -1466,7 +1600,7 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask,
u64 *vp_bitmap, unsigned long *vcpu_bitmap)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct kvm_vcpu *vcpu;
int i, bank, sbank = 0;
@@ -1483,18 +1617,16 @@ static __always_inline unsigned long *sparse_set_to_vcpu_mask(
bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index,
- (unsigned long *)vp_bitmap))
+ if (test_bit(kvm_hv_get_vpindex(vcpu), (unsigned long *)vp_bitmap))
__set_bit(i, vcpu_bitmap);
}
return vcpu_bitmap;
}
-static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa,
- u16 rep_cnt, bool ex)
+static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, u64 ingpa, u16 rep_cnt, bool ex)
{
- struct kvm *kvm = current_vcpu->kvm;
- struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
@@ -1592,10 +1724,10 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
}
}
-static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa,
+static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, u64 ingpa, u64 outgpa,
bool ex, bool fast)
{
- struct kvm *kvm = current_vcpu->kvm;
+ struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
@@ -1666,9 +1798,20 @@ ret_success:
return HV_STATUS_SUCCESS;
}
-bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
{
- return READ_ONCE(kvm->arch.hyperv.hv_guest_os_id) != 0;
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0);
+ if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX)
+ vcpu->arch.hyperv_enabled = true;
+ else
+ vcpu->arch.hyperv_enabled = false;
+}
+
+bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hyperv_enabled && to_kvm_hv(vcpu->kvm)->hv_guest_os_id;
}
static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
@@ -1698,6 +1841,7 @@ static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
{
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
struct eventfd_ctx *eventfd;
if (unlikely(!fast)) {
@@ -1726,7 +1870,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, bool fast, u64 param)
/* the eventfd is protected by vcpu->kvm->srcu, but conn_to_evt isn't */
rcu_read_lock();
- eventfd = idr_find(&vcpu->kvm->arch.hyperv.conn_to_evt, param);
+ eventfd = idr_find(&hv->conn_to_evt, param);
rcu_read_unlock();
if (!eventfd)
return HV_STATUS_INVALID_PORT_ID;
@@ -1745,7 +1889,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
* hypercall generates UD from non zero cpl and real mode
* per HYPER-V spec
*/
- if (kvm_x86_ops.get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || !is_protmode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -1793,7 +1937,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
fallthrough; /* maybe userspace knows this conn_id */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
- if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) {
+ if (unlikely(rep || !to_hv_synic(vcpu)->active)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
@@ -1855,7 +1999,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
}
fallthrough;
case HVCALL_RESET_DEBUG_SESSION: {
- struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu);
+ struct kvm_hv_syndbg *syndbg = to_hv_syndbg(vcpu);
if (!kvm_hv_is_syndbg_enabled(vcpu)) {
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
@@ -1885,23 +2029,26 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
void kvm_hv_init_vm(struct kvm *kvm)
{
- mutex_init(&kvm->arch.hyperv.hv_lock);
- idr_init(&kvm->arch.hyperv.conn_to_evt);
+ struct kvm_hv *hv = to_kvm_hv(kvm);
+
+ mutex_init(&hv->hv_lock);
+ idr_init(&hv->conn_to_evt);
}
void kvm_hv_destroy_vm(struct kvm *kvm)
{
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct eventfd_ctx *eventfd;
int i;
- idr_for_each_entry(&kvm->arch.hyperv.conn_to_evt, eventfd, i)
+ idr_for_each_entry(&hv->conn_to_evt, eventfd, i)
eventfd_ctx_put(eventfd);
- idr_destroy(&kvm->arch.hyperv.conn_to_evt);
+ idr_destroy(&hv->conn_to_evt);
}
static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct eventfd_ctx *eventfd;
int ret;
@@ -1925,7 +2072,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
static int kvm_hv_eventfd_deassign(struct kvm *kvm, u32 conn_id)
{
- struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_hv *hv = to_kvm_hv(kvm);
struct eventfd_ctx *eventfd;
mutex_lock(&hv->hv_lock);
@@ -1997,8 +2144,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
break;
case HYPERV_CPUID_INTERFACE:
- memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
- ent->eax = signature[0];
+ ent->eax = HYPERV_CPUID_SIGNATURE_EAX;
break;
case HYPERV_CPUID_VERSION:
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 6d7def2b0aad..60547d5cb6d7 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -50,38 +50,46 @@
/* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */
#define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2)
-static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
+static inline struct kvm_hv *to_kvm_hv(struct kvm *kvm)
{
- return &vcpu->arch.hyperv;
+ return &kvm->arch.hyperv;
}
-static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu)
+static inline struct kvm_vcpu_hv *to_hv_vcpu(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_arch *arch;
-
- arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv);
- return container_of(arch, struct kvm_vcpu, arch);
+ return vcpu->arch.hyperv;
}
-static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu)
+static inline struct kvm_vcpu_hv_synic *to_hv_synic(struct kvm_vcpu *vcpu)
{
- return &vcpu->arch.hyperv.synic;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return &hv_vcpu->synic;
}
-static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+static inline struct kvm_vcpu *hv_synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
{
- return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
+ struct kvm_vcpu_hv *hv_vcpu = container_of(synic, struct kvm_vcpu_hv, synic);
+
+ return hv_vcpu->vcpu;
}
-static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu)
+static inline struct kvm_hv_syndbg *to_hv_syndbg(struct kvm_vcpu *vcpu)
{
return &vcpu->kvm->arch.hyperv.hv_syndbg;
}
+static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu);
+}
+
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
-bool kvm_hv_hypercall_enabled(struct kvm *kvm);
+bool kvm_hv_hypercall_enabled(struct kvm_vcpu *vcpu);
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
void kvm_hv_irq_routing_update(struct kvm *kvm);
@@ -89,32 +97,35 @@ int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
-void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
-void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu);
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
struct hv_vp_assist_page *assist_page);
-static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
- int timer_index)
+static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
+ int timer_index)
{
- return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index];
+ return &to_hv_vcpu(vcpu)->stimer[timer_index];
}
-static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+static inline struct kvm_vcpu *hv_stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
{
struct kvm_vcpu_hv *hv_vcpu;
hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
stimer[0]);
- return hv_vcpu_to_vcpu(hv_vcpu);
+ return hv_vcpu->vcpu;
}
static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
{
- return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap,
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ return !bitmap_empty(hv_vcpu->stimer_pending_bitmap,
HV_SYNIC_STIMER_COUNT);
}
@@ -122,9 +133,11 @@ void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
void kvm_hv_setup_tsc_page(struct kvm *kvm,
struct pvclock_vcpu_time_info *hv_clock);
+void kvm_hv_invalidate_tsc_page(struct kvm *kvm);
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu);
int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 814698e5b152..172b05343cfd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -14,6 +14,7 @@
#include "irq.h"
#include "i8254.h"
#include "x86.h"
+#include "xen.h"
/*
* check if there are pending timer events
@@ -56,6 +57,9 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v)
if (!lapic_in_kernel(v))
return v->arch.interrupt.injected;
+ if (kvm_xen_has_interrupt(v))
+ return 1;
+
if (!kvm_apic_accept_pic_intr(v))
return 0;
@@ -110,6 +114,9 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
if (!lapic_in_kernel(v))
return v->arch.interrupt.nr;
+ if (kvm_xen_has_interrupt(v))
+ return v->kvm->arch.xen.upcall_vector;
+
if (irqchip_split(v->kvm)) {
int vector = v->arch.pending_external_vector;
@@ -143,8 +150,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
__kvm_migrate_apic_timer(vcpu);
__kvm_migrate_pit_timer(vcpu);
- if (kvm_x86_ops.migrate_timers)
- kvm_x86_ops.migrate_timers(vcpu);
+ static_call_cond(kvm_x86_migrate_timers)(vcpu);
}
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index a889563ad02d..2e11da2f5621 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -68,7 +68,7 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg)
return 0;
if (!kvm_register_is_available(vcpu, reg))
- kvm_x86_ops.cache_reg(vcpu, reg);
+ static_call(kvm_x86_cache_reg)(vcpu, reg);
return vcpu->arch.regs[reg];
}
@@ -108,7 +108,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
might_sleep(); /* on svm */
if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_PDPTR);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_PDPTR);
return vcpu->arch.walk_mmu->pdptrs[index];
}
@@ -118,7 +118,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR0);
return vcpu->arch.cr0 & mask;
}
@@ -132,14 +132,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
if ((tmask & vcpu->arch.cr4_guest_owned_bits) &&
!kvm_register_is_available(vcpu, VCPU_EXREG_CR4))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR4);
return vcpu->arch.cr4 & mask;
}
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR3);
+ static_call(kvm_x86_cache_reg)(vcpu, VCPU_EXREG_CR3);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 43c93ffa76ed..0d359115429a 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -205,7 +205,7 @@ struct x86_emulate_ops {
ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
int (*cpl)(struct x86_emulate_ctxt *ctxt);
- int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+ void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43cceadd073e..cc369b9ad8f1 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -91,8 +91,8 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
-struct static_key_deferred apic_hw_disabled __read_mostly;
-struct static_key_deferred apic_sw_disabled __read_mostly;
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
+__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
static inline int apic_enabled(struct kvm_lapic *apic)
{
@@ -290,9 +290,9 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
if (enabled != apic->sw_enabled) {
apic->sw_enabled = enabled;
if (enabled)
- static_key_slow_dec_deferred(&apic_sw_disabled);
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
else
- static_key_slow_inc(&apic_sw_disabled.key);
+ static_branch_inc(&apic_sw_disabled.key);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
@@ -484,7 +484,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
if (unlikely(vcpu->arch.apicv_active)) {
/* need to update RVI */
kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR);
- kvm_x86_ops.hwapic_irr_update(vcpu,
+ static_call(kvm_x86_hwapic_irr_update)(vcpu,
apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
@@ -515,7 +515,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* just set SVI.
*/
if (unlikely(vcpu->arch.apicv_active))
- kvm_x86_ops.hwapic_isr_update(vcpu, vec);
+ static_call(kvm_x86_hwapic_isr_update)(vcpu, vec);
else {
++apic->isr_count;
BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
@@ -563,8 +563,8 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* and must be left alone.
*/
if (unlikely(vcpu->arch.apicv_active))
- kvm_x86_ops.hwapic_isr_update(vcpu,
- apic_find_highest_isr(apic));
+ static_call(kvm_x86_hwapic_isr_update)(vcpu,
+ apic_find_highest_isr(apic));
else {
--apic->isr_count;
BUG_ON(apic->isr_count < 0);
@@ -701,7 +701,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
if (apic->vcpu->arch.apicv_active)
- highest_irr = kvm_x86_ops.sync_pir_to_irr(apic->vcpu);
+ highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);
if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
@@ -1090,7 +1090,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic->regs + APIC_TMR);
}
- if (kvm_x86_ops.deliver_posted_interrupt(vcpu, vector)) {
+ if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) {
kvm_lapic_set_irr(vector, apic);
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
@@ -1245,7 +1245,8 @@ static int apic_set_eoi(struct kvm_lapic *apic)
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap))
+ if (to_hv_vcpu(apic->vcpu) &&
+ test_bit(vector, to_hv_synic(apic->vcpu)->vec_bitmap))
kvm_hv_synic_send_eoi(apic->vcpu, vector);
kvm_ioapic_send_eoi(apic, vector);
@@ -1641,7 +1642,16 @@ static void apic_timer_expired(struct kvm_lapic *apic, bool from_timer_fn)
}
if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
- kvm_wait_lapic_expire(vcpu);
+ /*
+ * Ensure the guest's timer has truly expired before posting an
+ * interrupt. Open code the relevant checks to avoid querying
+ * lapic_timer_int_injected(), which will be false since the
+ * interrupt isn't yet injected. Waiting until after injecting
+ * is not an option since that won't help a posted interrupt.
+ */
+ if (vcpu->arch.apic->lapic_timer.expired_tscdeadline &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
+ __kvm_wait_lapic_expire(vcpu);
kvm_apic_inject_pending_timer_irqs(apic);
return;
}
@@ -1814,7 +1824,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic)
{
WARN_ON(preemptible());
WARN_ON(!apic->lapic_timer.hv_timer_in_use);
- kvm_x86_ops.cancel_hv_timer(apic->vcpu);
+ static_call(kvm_x86_cancel_hv_timer)(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
}
@@ -1831,7 +1841,7 @@ static bool start_hv_timer(struct kvm_lapic *apic)
if (!ktimer->tscdeadline)
return false;
- if (kvm_x86_ops.set_hv_timer(vcpu, ktimer->tscdeadline, &expired))
+ if (static_call(kvm_x86_set_hv_timer)(vcpu, ktimer->tscdeadline, &expired))
return false;
ktimer->hv_timer_in_use = true;
@@ -2175,10 +2185,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
hrtimer_cancel(&apic->lapic_timer.timer);
if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
- static_key_slow_dec_deferred(&apic_hw_disabled);
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
if (!apic->sw_enabled)
- static_key_slow_dec_deferred(&apic_sw_disabled);
+ static_branch_slow_dec_deferred(&apic_sw_disabled);
if (apic->regs)
free_page((unsigned long)apic->regs);
@@ -2250,9 +2260,9 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
if (value & MSR_IA32_APICBASE_ENABLE) {
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
- static_key_slow_dec_deferred(&apic_hw_disabled);
+ static_branch_slow_dec_deferred(&apic_hw_disabled);
} else {
- static_key_slow_inc(&apic_hw_disabled.key);
+ static_branch_inc(&apic_hw_disabled.key);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
}
@@ -2261,7 +2271,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE))
- kvm_x86_ops.set_virtual_apic_mode(vcpu);
+ static_call(kvm_x86_set_virtual_apic_mode)(vcpu);
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
@@ -2338,9 +2348,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (vcpu->arch.apicv_active) {
- kvm_x86_ops.apicv_post_state_restore(vcpu);
- kvm_x86_ops.hwapic_irr_update(vcpu, -1);
- kvm_x86_ops.hwapic_isr_update(vcpu, -1);
+ static_call(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call(kvm_x86_hwapic_irr_update)(vcpu, -1);
+ static_call(kvm_x86_hwapic_isr_update)(vcpu, -1);
}
vcpu->arch.apic_arb_prio = 0;
@@ -2449,7 +2459,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
* thinking that APIC state has changed.
*/
vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
- static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
+ static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
return 0;
@@ -2512,7 +2522,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
*/
apic_clear_irr(vector, apic);
- if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+ if (to_hv_vcpu(vcpu) && test_bit(vector, to_hv_synic(vcpu)->auto_eoi_bitmap)) {
/*
* For auto-EOI interrupts, there might be another pending
* interrupt above PPR, so check whether to raise another
@@ -2594,6 +2604,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
apic_update_ppr(apic);
hrtimer_cancel(&apic->lapic_timer.timer);
+ apic->lapic_timer.expired_tscdeadline = 0;
apic_update_lvtt(apic);
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
@@ -2601,10 +2612,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
- kvm_x86_ops.apicv_post_state_restore(vcpu);
- kvm_x86_ops.hwapic_irr_update(vcpu,
+ static_call(kvm_x86_apicv_post_state_restore)(vcpu);
+ static_call(kvm_x86_hwapic_irr_update)(vcpu,
apic_find_highest_irr(apic));
- kvm_x86_ops.hwapic_isr_update(vcpu,
+ static_call(kvm_x86_hwapic_isr_update)(vcpu,
apic_find_highest_isr(apic));
}
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -2904,13 +2915,6 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
}
}
-void kvm_lapic_init(void)
-{
- /* do not patch jump label more than once per second */
- jump_label_rate_limit(&apic_hw_disabled, HZ);
- jump_label_rate_limit(&apic_sw_disabled, HZ);
-}
-
void kvm_lapic_exit(void)
{
static_key_deferred_flush(&apic_hw_disabled);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 4fb86e3a9dd3..997c45a5963a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -6,6 +6,8 @@
#include <linux/kvm_host.h>
+#include "hyperv.h"
+
#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1
#define KVM_APIC_LVT_NUM 6
@@ -125,13 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
-}
-
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
-void kvm_lapic_init(void);
void kvm_lapic_exit(void);
#define VEC_POS(v) ((v) & (32 - 1))
@@ -172,29 +168,29 @@ static inline void kvm_lapic_set_reg(struct kvm_lapic *apic, int reg_off, u32 va
__kvm_lapic_set_reg(apic->regs, reg_off, val);
}
-extern struct static_key kvm_no_apic_vcpu;
+DECLARE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
{
- if (static_key_false(&kvm_no_apic_vcpu))
+ if (static_branch_unlikely(&kvm_has_noapic_vcpu))
return vcpu->arch.apic;
return true;
}
-extern struct static_key_deferred apic_hw_disabled;
+extern struct static_key_false_deferred apic_hw_disabled;
static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
{
- if (static_key_false(&apic_hw_disabled.key))
+ if (static_branch_unlikely(&apic_hw_disabled.key))
return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
return MSR_IA32_APICBASE_ENABLE;
}
-extern struct static_key_deferred apic_sw_disabled;
+extern struct static_key_false_deferred apic_sw_disabled;
static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
{
- if (static_key_false(&apic_sw_disabled.key))
+ if (static_branch_unlikely(&apic_sw_disabled.key))
return apic->sw_enabled;
return true;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 261be1d2032b..c68bfc3e2402 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -102,7 +102,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(root_hpa))
return;
- kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
+ static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa | kvm_get_active_pcid(vcpu),
vcpu->arch.mmu->shadow_root_level);
}
@@ -152,7 +152,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
*
* TODO: introduce APIs to split these two cases.
*/
-static inline int is_writable_pte(unsigned long pte)
+static inline bool is_writable_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
}
@@ -174,8 +174,8 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
unsigned pte_access, unsigned pte_pkey,
unsigned pfec)
{
- int cpl = kvm_x86_ops.get_cpl(vcpu);
- unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+ int cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
/*
* If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..d75524bc8423 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -190,7 +190,7 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
int ret = -ENOTSUPP;
if (range && kvm_x86_ops.tlb_remote_flush_with_range)
- ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range);
+ ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
if (ret)
kvm_flush_remote_tlbs(kvm);
@@ -844,17 +844,17 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
int i, count = 0;
if (!rmap_head->val) {
- rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+ rmap_printk("%p %llx 0->1\n", spte, *spte);
rmap_head->val = (unsigned long)spte;
} else if (!(rmap_head->val & 1)) {
- rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+ rmap_printk("%p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_pte_list_desc(vcpu);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
- rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+ rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
while (desc->sptes[PTE_LIST_EXT-1]) {
count += PTE_LIST_EXT;
@@ -906,14 +906,14 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
pr_err("%s: %p 0->BUG\n", __func__, spte);
BUG();
} else if (!(rmap_head->val & 1)) {
- rmap_printk("%s: %p 1->0\n", __func__, spte);
+ rmap_printk("%p 1->0\n", spte);
if ((u64 *)rmap_head->val != spte) {
pr_err("%s: %p 1->BUG\n", __func__, spte);
BUG();
}
rmap_head->val = 0;
} else {
- rmap_printk("%s: %p many->many\n", __func__, spte);
+ rmap_printk("%p many->many\n", spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
prev_desc = NULL;
while (desc) {
@@ -1115,7 +1115,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
!(pt_protect && spte_can_locklessly_be_made_writable(spte)))
return false;
- rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("spte %p %llx\n", sptep, *sptep);
if (pt_protect)
spte &= ~SPTE_MMU_WRITEABLE;
@@ -1142,7 +1142,7 @@ static bool spte_clear_dirty(u64 *sptep)
{
u64 spte = *sptep;
- rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("spte %p %llx\n", sptep, *sptep);
MMU_WARN_ON(!spte_ad_enabled(spte));
spte &= ~shadow_dirty_mask;
@@ -1165,7 +1165,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
* - W bit on ad-disabled SPTEs.
* Returns true iff any D or W bits were cleared.
*/
-static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1180,35 +1181,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
return flush;
}
-static bool spte_set_dirty(u64 *sptep)
-{
- u64 spte = *sptep;
-
- rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
-
- /*
- * Similar to the !kvm_x86_ops.slot_disable_log_dirty case,
- * do not bother adding back write access to pages marked
- * SPTE_AD_WRPROT_ONLY_MASK.
- */
- spte |= shadow_dirty_mask;
-
- return mmu_spte_update(sptep, spte);
-}
-
-static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
-{
- u64 *sptep;
- struct rmap_iterator iter;
- bool flush = false;
-
- for_each_rmap_spte(rmap_head, &iter, sptep)
- if (spte_ad_enabled(*sptep))
- flush |= spte_set_dirty(sptep);
-
- return flush;
-}
-
/**
* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
* @kvm: kvm instance
@@ -1225,7 +1197,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, true);
while (mask) {
@@ -1248,25 +1220,24 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
*
* Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
*/
-void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t gfn_offset, unsigned long mask)
+static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
{
struct kvm_rmap_head *rmap_head;
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, false);
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PG_LEVEL_4K, slot);
- __rmap_clear_dirty(kvm, rmap_head);
+ __rmap_clear_dirty(kvm, rmap_head, slot);
/* clear the first set bit */
mask &= mask - 1;
}
}
-EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
/**
* kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
@@ -1282,19 +1253,15 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- if (kvm_x86_ops.enable_log_dirty_pt_masked)
- kvm_x86_ops.enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
- mask);
+ if (kvm_x86_ops.cpu_dirty_log_size)
+ kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
int kvm_cpu_dirty_log_size(void)
{
- if (kvm_x86_ops.cpu_dirty_log_size)
- return kvm_x86_ops.cpu_dirty_log_size();
-
- return 0;
+ return kvm_x86_ops.cpu_dirty_log_size;
}
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
@@ -1309,7 +1276,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
write_protected |= __rmap_write_protect(kvm, rmap_head, true);
}
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
write_protected |=
kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn);
@@ -1324,14 +1291,15 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
}
-static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
while ((sptep = rmap_get_first(rmap_head, &iter))) {
- rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
+ rmap_printk("spte %p %llx.\n", sptep, *sptep);
pte_list_remove(rmap_head, sptep);
flush = true;
@@ -1344,7 +1312,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
- return kvm_zap_rmapp(kvm, rmap_head);
+ return kvm_zap_rmapp(kvm, rmap_head, slot);
}
static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -1363,7 +1331,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
- rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
+ rmap_printk("spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
need_flush = 1;
@@ -1456,16 +1424,17 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-static int kvm_handle_hva_range(struct kvm *kvm,
- unsigned long start,
- unsigned long end,
- unsigned long data,
- int (*handler)(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot,
- gfn_t gfn,
- int level,
- unsigned long data))
+static __always_inline int
+kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn,
+ int level,
+ unsigned long data))
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
@@ -1521,7 +1490,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
r = kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
r |= kvm_tdp_mmu_zap_hva_range(kvm, start, end);
return r;
@@ -1533,7 +1502,7 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
r = kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
r |= kvm_tdp_mmu_set_spte_hva(kvm, hva, &pte);
return r;
@@ -1588,7 +1557,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
int young = false;
young = kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_age_hva_range(kvm, start, end);
return young;
@@ -1599,7 +1568,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
int young = false;
young = kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
young |= kvm_tdp_mmu_test_age_hva(kvm, hva);
return young;
@@ -1723,13 +1692,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
return 0;
}
-static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- const void *pte)
-{
- WARN_ON(1);
-}
-
#define KVM_PAGE_ARRAY_NR 16
struct kvm_mmu_pages {
@@ -2016,9 +1978,9 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
flush |= kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
- if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+ if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
- cond_resched_lock(&vcpu->kvm->mmu_lock);
+ cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
flush = false;
}
}
@@ -2417,7 +2379,7 @@ static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
return 0;
restart:
- list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) {
+ list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
/*
* Don't zap active root pages, the page itself can't be freed
* and zapping it will just force vCPUs to realloc and reload.
@@ -2470,7 +2432,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
*/
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
@@ -2481,7 +2443,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -2492,7 +2454,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
sp->role.word);
@@ -2500,11 +2462,25 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
+
+ return r;
+}
+
+static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+ int r;
+
+ if (vcpu->arch.mmu->direct_map)
+ return 0;
+
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
+
+ r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
return r;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
@@ -2758,11 +2734,18 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
if (sp->role.level > PG_LEVEL_4K)
return;
+ /*
+ * If addresses are being invalidated, skip prefetching to avoid
+ * accidentally prefetching those addresses.
+ */
+ if (unlikely(vcpu->kvm->mmu_notifier_count))
+ return;
+
__direct_pte_prefetch(vcpu, sp, sptep);
}
-static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
- kvm_pfn_t pfn, struct kvm_memory_slot *slot)
+static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+ struct kvm_memory_slot *slot)
{
unsigned long hva;
pte_t *pte;
@@ -2781,19 +2764,36 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
*/
hva = __gfn_to_hva_memslot(slot, gfn);
- pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
+ pte = lookup_address_in_mm(kvm->mm, hva, &level);
if (unlikely(!pte))
return PG_LEVEL_4K;
return level;
}
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t pfn, int max_level)
+{
+ struct kvm_lpage_info *linfo;
+
+ max_level = min(max_level, max_huge_page_level);
+ for ( ; max_level > PG_LEVEL_4K; max_level--) {
+ linfo = lpage_info_slot(gfn, slot, max_level);
+ if (!linfo->disallow_lpage)
+ break;
+ }
+
+ if (max_level == PG_LEVEL_4K)
+ return PG_LEVEL_4K;
+
+ return host_pfn_mapping_level(kvm, gfn, pfn, slot);
+}
+
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
bool huge_page_disallowed, int *req_level)
{
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
kvm_pfn_t pfn = *pfnp;
kvm_pfn_t mask;
int level;
@@ -2810,17 +2810,7 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
if (!slot)
return PG_LEVEL_4K;
- max_level = min(max_level, max_huge_page_level);
- for ( ; max_level > PG_LEVEL_4K; max_level--) {
- linfo = lpage_info_slot(gfn, slot, max_level);
- if (!linfo->disallow_lpage)
- break;
- }
-
- if (max_level == PG_LEVEL_4K)
- return PG_LEVEL_4K;
-
- level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
+ level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
if (level == PG_LEVEL_4K)
return level;
@@ -3161,7 +3151,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
if (kvm_mmu_put_root(kvm, sp)) {
- if (sp->tdp_mmu_page)
+ if (is_tdp_mmu_page(sp))
kvm_tdp_mmu_free_root(kvm, sp);
else if (sp->role.invalid)
kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
@@ -3192,7 +3182,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return;
}
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3215,7 +3205,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
@@ -3236,16 +3226,16 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
{
struct kvm_mmu_page *sp;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
if (make_mmu_pages_available(vcpu)) {
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
return INVALID_PAGE;
}
sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL);
++sp->root_count;
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
return __pa(sp->spt);
}
@@ -3255,7 +3245,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
hpa_t root;
unsigned i;
- if (vcpu->kvm->arch.tdp_mmu_enabled) {
+ if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
if (!VALID_PAGE(root))
@@ -3416,17 +3406,17 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
!smp_load_acquire(&sp->unsync_children))
return;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
mmu_sync_children(vcpu, sp);
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
return;
}
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
for (i = 0; i < 4; ++i) {
@@ -3440,9 +3430,8 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access, struct x86_exception *exception)
@@ -3658,8 +3647,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
- bool *writable)
+ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
+ bool write, bool *writable)
{
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
@@ -3672,7 +3661,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
}
async = false;
- *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
+ write, writable, hva);
if (!async)
return false; /* *pfn has correct page already */
@@ -3686,7 +3676,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return true;
}
- *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
+ write, writable, hva);
return false;
}
@@ -3699,6 +3690,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
kvm_pfn_t pfn;
+ hva_t hva;
int r;
if (page_fault_handle_page_track(vcpu, error_code, gfn))
@@ -3717,15 +3709,21 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
+ write, &map_writable))
return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
return r;
r = RET_PF_RETRY;
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+
+ if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ read_lock(&vcpu->kvm->mmu_lock);
+ else
+ write_lock(&vcpu->kvm->mmu_lock);
+
+ if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
if (r)
@@ -3739,7 +3737,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
prefault, is_tdp);
out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
+ if (is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa))
+ read_unlock(&vcpu->kvm->mmu_lock);
+ else
+ write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
return r;
}
@@ -3813,7 +3814,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->update_pte = nonpaging_update_pte;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = true;
@@ -3984,20 +3984,27 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu,
static void
__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct rsvd_bits_validate *rsvd_check,
- int maxphyaddr, int level, bool nx, bool gbpages,
+ u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
bool pse, bool amd)
{
- u64 exb_bit_rsvd = 0;
u64 gbpages_bit_rsvd = 0;
u64 nonleaf_bit8_rsvd = 0;
+ u64 high_bits_rsvd;
rsvd_check->bad_mt_xwr = 0;
- if (!nx)
- exb_bit_rsvd = rsvd_bits(63, 63);
if (!gbpages)
gbpages_bit_rsvd = rsvd_bits(7, 7);
+ if (level == PT32E_ROOT_LEVEL)
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62);
+ else
+ high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
+
+ /* Note, NX doesn't exist in PDPTEs, this is handled below. */
+ if (!nx)
+ high_bits_rsvd |= rsvd_bits(63, 63);
+
/*
* Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
* leaf entries) on AMD CPUs only.
@@ -4026,45 +4033,39 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
break;
case PT32E_ROOT_LEVEL:
- rsvd_check->rsvd_bits_mask[0][2] =
- rsvd_bits(maxphyaddr, 63) |
- rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
- rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PDE */
- rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62); /* PTE */
- rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 62) |
- rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
+ high_bits_rsvd |
+ rsvd_bits(5, 8) |
+ rsvd_bits(1, 2); /* PDPTE */
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
break;
case PT64_ROOT_5LEVEL:
- rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
- rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
rsvd_check->rsvd_bits_mask[1][4] =
rsvd_check->rsvd_bits_mask[0][4];
fallthrough;
case PT64_ROOT_4LEVEL:
- rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
- rsvd_bits(maxphyaddr, 51);
- rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- gbpages_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
- rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
+ nonleaf_bit8_rsvd |
+ rsvd_bits(7, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
+ gbpages_bit_rsvd;
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
rsvd_check->rsvd_bits_mask[1][3] =
rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
- gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 29);
- rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) |
- rsvd_bits(13, 20); /* large page */
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
+ gbpages_bit_rsvd |
+ rsvd_bits(13, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+ rsvd_bits(13, 20); /* large page */
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
break;
@@ -4075,8 +4076,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
__reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
- cpuid_maxphyaddr(vcpu), context->root_level,
- context->nx,
+ vcpu->arch.reserved_gpa_bits,
+ context->root_level, context->nx,
guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
is_pse(vcpu),
guest_cpuid_is_amd_or_hygon(vcpu));
@@ -4084,27 +4085,22 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
static void
__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
- int maxphyaddr, bool execonly)
+ u64 pa_bits_rsvd, bool execonly)
{
+ u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
u64 bad_mt_xwr;
- rsvd_check->rsvd_bits_mask[0][4] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][3] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
- rsvd_check->rsvd_bits_mask[0][2] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][1] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
- rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
+ rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6);
+ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
/* large page */
rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
- rsvd_check->rsvd_bits_mask[1][2] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
- rsvd_check->rsvd_bits_mask[1][1] =
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
+ rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29);
+ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20);
rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
@@ -4123,7 +4119,12 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
- cpuid_maxphyaddr(vcpu), execonly);
+ vcpu->arch.reserved_gpa_bits, execonly);
+}
+
+static inline u64 reserved_hpa_bits(void)
+{
+ return rsvd_bits(shadow_phys_bits, 63);
}
/*
@@ -4145,7 +4146,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
*/
shadow_zero_check = &context->shadow_zero_check;
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- shadow_phys_bits,
+ reserved_hpa_bits(),
context->shadow_root_level, uses_nx,
guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
is_pse(vcpu), true);
@@ -4182,14 +4183,13 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
if (boot_cpu_is_amd())
__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
- shadow_phys_bits,
+ reserved_hpa_bits(),
context->shadow_root_level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
true, true);
else
__reset_rsvds_bits_mask_ept(shadow_zero_check,
- shadow_phys_bits,
- false);
+ reserved_hpa_bits(), false);
if (!shadow_me_mask)
return;
@@ -4209,7 +4209,7 @@ reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context, bool execonly)
{
__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
- shadow_phys_bits, execonly);
+ reserved_hpa_bits(), execonly);
}
#define BYTE_MASK(access) \
@@ -4395,7 +4395,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
- context->update_pte = paging64_update_pte;
context->shadow_root_level = level;
context->direct_map = false;
}
@@ -4424,7 +4423,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
context->gva_to_gpa = paging32_gva_to_gpa;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
- context->update_pte = paging32_update_pte;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = false;
}
@@ -4506,7 +4504,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->page_fault = kvm_tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->update_pte = nonpaging_update_pte;
context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
context->direct_map = true;
context->get_guest_pgd = get_cr3;
@@ -4678,7 +4675,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->gva_to_gpa = ept_gva_to_gpa;
context->sync_page = ept_sync_page;
context->invlpg = ept_invlpg;
- context->update_pte = ept_update_pte;
context->root_level = level;
context->direct_map = false;
context->mmu_role.as_u64 = new_role.as_u64;
@@ -4811,7 +4807,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
if (r)
goto out;
kvm_mmu_load_pgd(vcpu);
- kvm_x86_ops.tlb_flush_current(vcpu);
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
out:
return r;
}
@@ -4826,19 +4822,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- const void *new)
-{
- if (sp->role.level != PG_LEVEL_4K) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
-
- ++vcpu->kvm->stat.mmu_pte_updated;
- vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
-}
-
static bool need_remote_flush(u64 old, u64 new)
{
if (!is_shadow_present_pte(old))
@@ -4954,22 +4937,6 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
return spte;
}
-/*
- * Ignore various flags when determining if a SPTE can be immediately
- * overwritten for the current MMU.
- * - level: explicitly checked in mmu_pte_write_new_pte(), and will never
- * match the current MMU role, as MMU's level tracks the root level.
- * - access: updated based on the new guest PTE
- * - quadrant: handled by get_written_sptes()
- * - invalid: always false (loop only walks valid shadow pages)
- */
-static const union kvm_mmu_page_role role_ign = {
- .level = 0xf,
- .access = 0x7,
- .quadrant = 0x3,
- .invalid = 0x1,
-};
-
static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes,
struct kvm_page_track_notifier_node *node)
@@ -4999,7 +4966,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
*/
mmu_topup_memory_caches(vcpu, true);
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
@@ -5020,14 +4987,10 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
local_flush = true;
while (npte--) {
- u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
-
entry = *spte;
mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
- if (gentry &&
- !((sp->role.word ^ base_role) & ~role_ign.word) &&
- rmap_can_add(vcpu))
- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
+ if (gentry && sp->role.level != PG_LEVEL_4K)
+ ++vcpu->kvm->stat.mmu_pde_zapped;
if (need_remote_flush(entry, *spte))
remote_flush = true;
++spte;
@@ -5035,24 +4998,8 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
- spin_unlock(&vcpu->kvm->mmu_lock);
-}
-
-int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
- int r;
-
- if (vcpu->arch.mmu->direct_map)
- return 0;
-
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
- r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
- return r;
+ write_unlock(&vcpu->kvm->mmu_lock);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len)
@@ -5125,7 +5072,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
if (is_noncanonical_address(gva, vcpu))
return;
- kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+ static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
}
if (!mmu->invlpg)
@@ -5152,7 +5099,6 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
mmu->invlpg(vcpu, gva, root_hpa);
}
}
-EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
@@ -5182,7 +5128,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
}
if (tlb_flush)
- kvm_x86_ops.tlb_flush_gva(vcpu, gva);
+ static_call(kvm_x86_tlb_flush_gva)(vcpu, gva);
++vcpu->stat.invlpg;
@@ -5192,7 +5138,6 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
* for them.
*/
}
-EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
int tdp_huge_page_level)
@@ -5217,7 +5162,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot);
/* The caller should hold mmu-lock before calling this function. */
static __always_inline bool
@@ -5231,16 +5177,16 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap);
+ flush |= fn(kvm, iterator.rmap, memslot);
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
if (flush && lock_flush_tlb) {
kvm_flush_remote_tlbs_with_address(kvm,
start_gfn,
iterator.gfn - start_gfn + 1);
flush = false;
}
- cond_resched_lock(&kvm->mmu_lock);
+ cond_resched_rwlock_write(&kvm->mmu_lock);
}
}
@@ -5265,22 +5211,6 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
static __always_inline bool
-slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
-{
- return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
-}
-
-static __always_inline bool
-slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
-{
- return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1,
- KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
-}
-
-static __always_inline bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, bool lock_flush_tlb)
{
@@ -5390,7 +5320,7 @@ restart:
* be in active use by the guest.
*/
if (batch >= BATCH_ZAP_PAGES &&
- cond_resched_lock(&kvm->mmu_lock)) {
+ cond_resched_rwlock_write(&kvm->mmu_lock)) {
batch = 0;
goto restart;
}
@@ -5423,7 +5353,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
{
lockdep_assert_held(&kvm->slots_lock);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
trace_kvm_mmu_zap_all_fast(kvm);
/*
@@ -5447,10 +5377,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
kvm_zap_obsolete_pages(kvm);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_all(kvm);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
@@ -5492,7 +5422,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
int i;
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(memslot, slots) {
@@ -5510,17 +5440,18 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
}
}
- if (kvm->arch.tdp_mmu_enabled) {
+ if (is_tdp_mmu_enabled(kvm)) {
flush = kvm_tdp_mmu_zap_gfn_range(kvm, gfn_start, gfn_end);
if (flush)
kvm_flush_remote_tlbs(kvm);
}
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
static bool slot_rmap_write_protect(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head)
+ struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot)
{
return __rmap_write_protect(kvm, rmap_head, false);
}
@@ -5531,12 +5462,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
{
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect,
start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_4K);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
/*
* We can flush all the TLBs out of the mmu lock without TLB
@@ -5554,7 +5485,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head)
+ struct kvm_rmap_head *rmap_head,
+ struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -5575,8 +5507,8 @@ restart:
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
- (kvm_is_zone_device_pfn(pfn) ||
- PageCompound(pfn_to_page(pfn)))) {
+ sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
+ pfn, PG_LEVEL_NUM)) {
pte_list_remove(rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
@@ -5596,13 +5528,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
{
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
- spin_lock(&kvm->mmu_lock);
- slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
- kvm_mmu_zap_collapsible_spte, true);
+ struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
- if (kvm->arch.tdp_mmu_enabled)
- kvm_tdp_mmu_zap_collapsible_sptes(kvm, memslot);
- spin_unlock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
+ slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+
+ if (is_tdp_mmu_enabled(kvm))
+ kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
+ write_unlock(&kvm->mmu_lock);
}
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
@@ -5625,11 +5558,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
{
bool flush;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
/*
* It's also safe to flush TLBs out of mmu lock here as currently this
@@ -5640,40 +5573,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
-
-void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
-{
- bool flush;
-
- spin_lock(&kvm->mmu_lock);
- flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
- false);
- if (kvm->arch.tdp_mmu_enabled)
- flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, PG_LEVEL_2M);
- spin_unlock(&kvm->mmu_lock);
-
- if (flush)
- kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
-
-void kvm_mmu_slot_set_dirty(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
-{
- bool flush;
-
- spin_lock(&kvm->mmu_lock);
- flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
- if (kvm->arch.tdp_mmu_enabled)
- flush |= kvm_tdp_mmu_slot_set_dirty(kvm, memslot);
- spin_unlock(&kvm->mmu_lock);
-
- if (flush)
- kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
void kvm_mmu_zap_all(struct kvm *kvm)
{
@@ -5681,23 +5580,23 @@ void kvm_mmu_zap_all(struct kvm *kvm)
LIST_HEAD(invalid_list);
int ign;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
if (WARN_ON(sp->role.invalid))
continue;
if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
goto restart;
- if (cond_resched_lock(&kvm->mmu_lock))
+ if (cond_resched_rwlock_write(&kvm->mmu_lock))
goto restart;
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- if (kvm->arch.tdp_mmu_enabled)
+ if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_all(kvm);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
@@ -5757,7 +5656,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
continue;
idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
if (kvm_has_zapped_obsolete_pages(kvm)) {
kvm_mmu_commit_zap_page(kvm,
@@ -5768,7 +5667,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan);
unlock:
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
/*
@@ -5988,7 +5887,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
ulong to_zap;
rcu_idx = srcu_read_lock(&kvm->srcu);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
@@ -6005,22 +5904,22 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
struct kvm_mmu_page,
lpage_disallowed_link);
WARN_ON_ONCE(!sp->lpage_disallowed);
- if (sp->tdp_mmu_page)
+ if (is_tdp_mmu_page(sp)) {
kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn,
sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level));
- else {
+ } else {
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
WARN_ON_ONCE(sp->lpage_disallowed);
}
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- cond_resched_lock(&kvm->mmu_lock);
+ cond_resched_rwlock_write(&kvm->mmu_lock);
}
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index c8d51a37e2ce..ced15fd58fde 100644
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ b/arch/x86/kvm/mmu/mmu_audit.c
@@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
}
static bool mmu_audit;
-static struct static_key mmu_audit_key;
+static DEFINE_STATIC_KEY_FALSE(mmu_audit_key);
static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
{
@@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
{
- if (static_key_false((&mmu_audit_key)))
+ if (static_branch_unlikely((&mmu_audit_key)))
__kvm_mmu_audit(vcpu, point);
}
@@ -259,7 +259,7 @@ static void mmu_audit_enable(void)
if (mmu_audit)
return;
- static_key_slow_inc(&mmu_audit_key);
+ static_branch_inc(&mmu_audit_key);
mmu_audit = true;
}
@@ -268,7 +268,7 @@ static void mmu_audit_disable(void)
if (!mmu_audit)
return;
- static_key_slow_dec(&mmu_audit_key);
+ static_branch_dec(&mmu_audit_key);
mmu_audit = false;
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index bfc6389edc28..1f6f98c76bdf 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -12,7 +12,7 @@
extern bool dbg;
#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
-#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
@@ -56,7 +56,12 @@ struct kvm_mmu_page {
/* Number of writes since the last time traversal visited this page. */
atomic_t write_flooding_count;
+#ifdef CONFIG_X86_64
bool tdp_mmu_page;
+
+ /* Used for freeing the page asyncronously if it is a TDP MMU page. */
+ struct rcu_head rcu_head;
+#endif
};
extern struct kmem_cache *mmu_page_header_cache;
@@ -73,15 +78,23 @@ static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
return to_shadow_page(__pa(sptep));
}
+static inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+{
+ return sp->role.smm ? 1 : 0;
+}
+
static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
{
/*
- * When using the EPT page-modification log, the GPAs in the log
- * would come from L2 rather than L1. Therefore, we need to rely
- * on write protection to record dirty pages. This also bypasses
- * PML, since writes now result in a vmexit.
+ * When using the EPT page-modification log, the GPAs in the CPU dirty
+ * log would come from L2 rather than L1. Therefore, we need to rely
+ * on write protection to record dirty pages, which bypasses PML, since
+ * writes now result in a vmexit. Note, the check on CPU dirty logging
+ * being enabled is mandatory as the bits used to denote WP-only SPTEs
+ * are reserved for NPT w/ PAE (32-bit KVM).
*/
- return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
+ return vcpu->arch.mmu == &vcpu->arch.guest_mmu &&
+ kvm_x86_ops.cpu_dirty_log_size;
}
bool is_nx_huge_page_enabled(void);
@@ -133,6 +146,8 @@ enum {
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
#define SET_SPTE_SPURIOUS BIT(2)
+int kvm_mmu_max_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t pfn, int max_level);
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
int max_level, kvm_pfn_t *pfnp,
bool huge_page_disallowed, int *req_level);
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 8443a675715b..34bb0ec69bd8 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -184,9 +184,9 @@ kvm_page_track_register_notifier(struct kvm *kvm,
head = &kvm->arch.track_notifier_head;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
hlist_add_head_rcu(&n->node, &head->track_notifier_list);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
}
EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
@@ -202,9 +202,9 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,
head = &kvm->arch.track_notifier_head;
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
hlist_del_rcu(&n->node);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
synchronize_srcu(&head->track_srcu);
}
EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50e268eb8e1a..55d7b473ac44 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -601,6 +601,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (sp->role.level > PG_LEVEL_4K)
return;
+ /*
+ * If addresses are being invalidated, skip prefetching to avoid
+ * accidentally prefetching those addresses.
+ */
+ if (unlikely(vcpu->kvm->mmu_notifier_count))
+ return;
+
if (sp->role.direct)
return __direct_pte_prefetch(vcpu, sp, sptep);
@@ -790,6 +797,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
struct guest_walker walker;
int r;
kvm_pfn_t pfn;
+ hva_t hva;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
int max_level;
@@ -840,8 +848,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
- &map_writable))
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+ write_fault, &map_writable))
return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
@@ -868,8 +876,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
}
r = RET_PF_RETRY;
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ write_lock(&vcpu->kvm->mmu_lock);
+ if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
@@ -881,7 +889,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
return r;
}
@@ -919,7 +927,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
return;
}
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
level = iterator.level;
sptep = iterator.sptep;
@@ -954,7 +962,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
break;
}
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
}
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index c51ad544f25b..ef55f0bc4ccf 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -120,7 +120,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
- spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
+ spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
if (host_writable)
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 2b3a30bd38b0..6de3950fd704 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -131,6 +131,25 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT PT64_SECOND_AVAIL_BITS_SHIFT
/*
+ * If a thread running without exclusive control of the MMU lock must perform a
+ * multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
+ * non-present intermediate value. Other threads which encounter this value
+ * should not modify the SPTE.
+ *
+ * This constant works because it is considered non-present on both AMD and
+ * Intel CPUs and does not create a L1TF vulnerability because the pfn section
+ * is zeroed out.
+ *
+ * Only used by the TDP MMU.
+ */
+#define REMOVED_SPTE (1ull << 59)
+
+static inline bool is_removed_spte(u64 spte)
+{
+ return spte == REMOVED_SPTE;
+}
+
+/*
* In some cases, we need to preserve the GFN of a non-present or reserved
* SPTE when we usurp the upper five bits of the physical address space to
* defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
@@ -185,23 +204,19 @@ static inline bool is_access_track_spte(u64 spte)
return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
}
-static inline int is_shadow_present_pte(u64 pte)
+static inline bool is_shadow_present_pte(u64 pte)
{
- return (pte != 0) && !is_mmio_spte(pte);
+ return (pte != 0) && !is_mmio_spte(pte) && !is_removed_spte(pte);
}
-static inline int is_large_pte(u64 pte)
+static inline bool is_large_pte(u64 pte)
{
return pte & PT_PAGE_SIZE_MASK;
}
-static inline int is_last_spte(u64 pte, int level)
+static inline bool is_last_spte(u64 pte, int level)
{
- if (level == PG_LEVEL_4K)
- return 1;
- if (is_large_pte(pte))
- return 1;
- return 0;
+ return (level == PG_LEVEL_4K) || is_large_pte(pte);
}
static inline bool is_executable_pte(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 87b7e16911db..b3ed302c1a35 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -12,7 +12,7 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
{
iter->sptep = iter->pt_path[iter->level - 1] +
SHADOW_PT_INDEX(iter->gfn << PAGE_SHIFT, iter->level);
- iter->old_spte = READ_ONCE(*iter->sptep);
+ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
}
static gfn_t round_gfn_for_level(gfn_t gfn, int level)
@@ -21,25 +21,37 @@ static gfn_t round_gfn_for_level(gfn_t gfn, int level)
}
/*
+ * Return the TDP iterator to the root PT and allow it to continue its
+ * traversal over the paging structure from there.
+ */
+void tdp_iter_restart(struct tdp_iter *iter)
+{
+ iter->yielded_gfn = iter->next_last_level_gfn;
+ iter->level = iter->root_level;
+
+ iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+ tdp_iter_refresh_sptep(iter);
+
+ iter->valid = true;
+}
+
+/*
* Sets a TDP iterator to walk a pre-order traversal of the paging structure
- * rooted at root_pt, starting with the walk to translate goal_gfn.
+ * rooted at root_pt, starting with the walk to translate next_last_level_gfn.
*/
void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
- int min_level, gfn_t goal_gfn)
+ int min_level, gfn_t next_last_level_gfn)
{
WARN_ON(root_level < 1);
WARN_ON(root_level > PT64_ROOT_MAX_LEVEL);
- iter->goal_gfn = goal_gfn;
+ iter->next_last_level_gfn = next_last_level_gfn;
iter->root_level = root_level;
iter->min_level = min_level;
- iter->level = root_level;
- iter->pt_path[iter->level - 1] = root_pt;
-
- iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
- tdp_iter_refresh_sptep(iter);
+ iter->pt_path[iter->root_level - 1] = (tdp_ptep_t)root_pt;
+ iter->as_id = kvm_mmu_page_as_id(sptep_to_sp(root_pt));
- iter->valid = true;
+ tdp_iter_restart(iter);
}
/*
@@ -47,7 +59,7 @@ void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
* address of the child page table referenced by the SPTE. Returns null if
* there is no such entry.
*/
-u64 *spte_to_child_pt(u64 spte, int level)
+tdp_ptep_t spte_to_child_pt(u64 spte, int level)
{
/*
* There's no child entry if this entry isn't present or is a
@@ -56,7 +68,7 @@ u64 *spte_to_child_pt(u64 spte, int level)
if (!is_shadow_present_pte(spte) || is_last_spte(spte, level))
return NULL;
- return __va(spte_to_pfn(spte) << PAGE_SHIFT);
+ return (tdp_ptep_t)__va(spte_to_pfn(spte) << PAGE_SHIFT);
}
/*
@@ -65,7 +77,7 @@ u64 *spte_to_child_pt(u64 spte, int level)
*/
static bool try_step_down(struct tdp_iter *iter)
{
- u64 *child_pt;
+ tdp_ptep_t child_pt;
if (iter->level == iter->min_level)
return false;
@@ -74,7 +86,7 @@ static bool try_step_down(struct tdp_iter *iter)
* Reread the SPTE before stepping down to avoid traversing into page
* tables that are no longer linked from this entry.
*/
- iter->old_spte = READ_ONCE(*iter->sptep);
+ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
child_pt = spte_to_child_pt(iter->old_spte, iter->level);
if (!child_pt)
@@ -82,7 +94,7 @@ static bool try_step_down(struct tdp_iter *iter)
iter->level--;
iter->pt_path[iter->level - 1] = child_pt;
- iter->gfn = round_gfn_for_level(iter->goal_gfn, iter->level);
+ iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
@@ -106,9 +118,9 @@ static bool try_step_side(struct tdp_iter *iter)
return false;
iter->gfn += KVM_PAGES_PER_HPAGE(iter->level);
- iter->goal_gfn = iter->gfn;
+ iter->next_last_level_gfn = iter->gfn;
iter->sptep++;
- iter->old_spte = READ_ONCE(*iter->sptep);
+ iter->old_spte = READ_ONCE(*rcu_dereference(iter->sptep));
return true;
}
@@ -158,25 +170,3 @@ void tdp_iter_next(struct tdp_iter *iter)
iter->valid = false;
}
-/*
- * Restart the walk over the paging structure from the root, starting from the
- * highest gfn the iterator had previously reached. Assumes that the entire
- * paging structure, except the root page, may have been completely torn down
- * and rebuilt.
- */
-void tdp_iter_refresh_walk(struct tdp_iter *iter)
-{
- gfn_t goal_gfn = iter->goal_gfn;
-
- if (iter->gfn > goal_gfn)
- goal_gfn = iter->gfn;
-
- tdp_iter_start(iter, iter->pt_path[iter->root_level - 1],
- iter->root_level, iter->min_level, goal_gfn);
-}
-
-u64 *tdp_iter_root_pt(struct tdp_iter *iter)
-{
- return iter->pt_path[iter->root_level - 1];
-}
-
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index 47170d0dc98e..b1748b988d3a 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -7,6 +7,8 @@
#include "mmu.h"
+typedef u64 __rcu *tdp_ptep_t;
+
/*
* A TDP iterator performs a pre-order walk over a TDP paging structure.
*/
@@ -15,11 +17,17 @@ struct tdp_iter {
* The iterator will traverse the paging structure towards the mapping
* for this GFN.
*/
- gfn_t goal_gfn;
+ gfn_t next_last_level_gfn;
+ /*
+ * The next_last_level_gfn at the time when the thread last
+ * yielded. Only yielding when the next_last_level_gfn !=
+ * yielded_gfn helps ensure forward progress.
+ */
+ gfn_t yielded_gfn;
/* Pointers to the page tables traversed to reach the current SPTE */
- u64 *pt_path[PT64_ROOT_MAX_LEVEL];
+ tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
/* A pointer to the current SPTE */
- u64 *sptep;
+ tdp_ptep_t sptep;
/* The lowest GFN mapped by the current SPTE */
gfn_t gfn;
/* The level of the root page given to the iterator */
@@ -28,6 +36,8 @@ struct tdp_iter {
int min_level;
/* The iterator's current level within the paging structure */
int level;
+ /* The address space ID, i.e. SMM vs. regular. */
+ int as_id;
/* A snapshot of the value at sptep */
u64 old_spte;
/*
@@ -49,12 +59,11 @@ struct tdp_iter {
#define for_each_tdp_pte(iter, root, root_level, start, end) \
for_each_tdp_pte_min_level(iter, root, root_level, PG_LEVEL_4K, start, end)
-u64 *spte_to_child_pt(u64 pte, int level);
+tdp_ptep_t spte_to_child_pt(u64 pte, int level);
void tdp_iter_start(struct tdp_iter *iter, u64 *root_pt, int root_level,
- int min_level, gfn_t goal_gfn);
+ int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
-void tdp_iter_refresh_walk(struct tdp_iter *iter);
-u64 *tdp_iter_root_pt(struct tdp_iter *iter);
+void tdp_iter_restart(struct tdp_iter *iter);
#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index b56d604809b8..462b1f71c77f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -7,32 +7,23 @@
#include "tdp_mmu.h"
#include "spte.h"
+#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
-#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
-#endif
-
-static bool is_tdp_mmu_enabled(void)
-{
-#ifdef CONFIG_X86_64
- return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
-#else
- return false;
-#endif /* CONFIG_X86_64 */
-}
/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
- if (!is_tdp_mmu_enabled())
+ if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
return;
/* This should not be changed for the lifetime of the VM. */
kvm->arch.tdp_mmu_enabled = true;
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
+ spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}
@@ -42,6 +33,12 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
return;
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
+
+ /*
+ * Ensure that all the outstanding RCU callbacks to free shadow pages
+ * can run before the VM is torn down.
+ */
+ rcu_barrier();
}
static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
@@ -53,7 +50,7 @@ static void tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
static inline bool tdp_mmu_next_root_valid(struct kvm *kvm,
struct kvm_mmu_page *root)
{
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
if (list_entry_is_head(root, &kvm->arch.tdp_mmu_roots, link))
return false;
@@ -88,22 +85,6 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
#define for_each_tdp_mmu_root(_kvm, _root) \
list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)
-bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
-{
- struct kvm_mmu_page *sp;
-
- if (!kvm->arch.tdp_mmu_enabled)
- return false;
- if (WARN_ON(!VALID_PAGE(hpa)))
- return false;
-
- sp = to_shadow_page(hpa);
- if (WARN_ON(!sp))
- return false;
-
- return sp->tdp_mmu_page && sp->root_count;
-}
-
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end, bool can_yield);
@@ -111,7 +92,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON(root->root_count);
WARN_ON(!root->tdp_mmu_page);
@@ -164,13 +145,13 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
- spin_lock(&kvm->mmu_lock);
+ write_lock(&kvm->mmu_lock);
/* Check for an existing root before allocating a new one. */
for_each_tdp_mmu_root(kvm, root) {
if (root->role.word == role.word) {
kvm_mmu_get_root(kvm, root);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
return root;
}
}
@@ -180,7 +161,7 @@ static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
list_add(&root->link, &kvm->arch.tdp_mmu_roots);
- spin_unlock(&kvm->mmu_lock);
+ write_unlock(&kvm->mmu_lock);
return root;
}
@@ -196,14 +177,32 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
return __pa(root->spt);
}
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level);
+static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
+{
+ free_page((unsigned long)sp->spt);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
-static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
+/*
+ * This is called through call_rcu in order to free TDP page table memory
+ * safely with respect to other kernel threads that may be operating on
+ * the memory.
+ * By only accessing TDP MMU page table memory in an RCU read critical
+ * section, and freeing it after a grace period, lockless access to that
+ * memory won't use it after it is freed.
+ */
+static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
- return sp->role.smm ? 1 : 0;
+ struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
+ rcu_head);
+
+ tdp_mmu_free_sp(sp);
}
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared);
+
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
@@ -235,6 +234,144 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
}
/**
+ * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
+ *
+ * @kvm: kvm instance
+ * @sp: the new page
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be adding or removing pages.
+ * @account_nx: This page replaces a NX large page and should be marked for
+ * eventual reclaim.
+ */
+static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared, bool account_nx)
+{
+ if (shared)
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
+ if (account_nx)
+ account_huge_nx_page(kvm, sp);
+
+ if (shared)
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/**
+ * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
+ *
+ * @kvm: kvm instance
+ * @sp: the page to be removed
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be adding or removing pages.
+ */
+static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool shared)
+{
+ if (shared)
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ else
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ list_del(&sp->link);
+ if (sp->lpage_disallowed)
+ unaccount_huge_nx_page(kvm, sp);
+
+ if (shared)
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+}
+
+/**
+ * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
+ *
+ * @kvm: kvm instance
+ * @pt: the page removed from the paging structure
+ * @shared: This operation may not be running under the exclusive use
+ * of the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
+ *
+ * Given a page table that has been removed from the TDP paging structure,
+ * iterates through the page table to clear SPTEs and free child page tables.
+ *
+ * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
+ * protection. Since this thread removed it from the paging structure,
+ * this thread will be responsible for ensuring the page is freed. Hence the
+ * early rcu_dereferences in the function.
+ */
+static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
+ bool shared)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
+ int level = sp->role.level;
+ gfn_t base_gfn = sp->gfn;
+ u64 old_child_spte;
+ u64 *sptep;
+ gfn_t gfn;
+ int i;
+
+ trace_kvm_mmu_prepare_zap_page(sp);
+
+ tdp_mmu_unlink_page(kvm, sp, shared);
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ sptep = rcu_dereference(pt) + i;
+ gfn = base_gfn + (i * KVM_PAGES_PER_HPAGE(level - 1));
+
+ if (shared) {
+ /*
+ * Set the SPTE to a nonpresent value that other
+ * threads will not overwrite. If the SPTE was
+ * already marked as removed then another thread
+ * handling a page fault could overwrite it, so
+ * set the SPTE until it is set from some other
+ * value to the removed SPTE value.
+ */
+ for (;;) {
+ old_child_spte = xchg(sptep, REMOVED_SPTE);
+ if (!is_removed_spte(old_child_spte))
+ break;
+ cpu_relax();
+ }
+ } else {
+ /*
+ * If the SPTE is not MMU-present, there is no backing
+ * page associated with the SPTE and so no side effects
+ * that need to be recorded, and exclusive ownership of
+ * mmu_lock ensures the SPTE can't be made present.
+ * Note, zapping MMIO SPTEs is also unnecessary as they
+ * are guarded by the memslots generation, not by being
+ * unreachable.
+ */
+ old_child_spte = READ_ONCE(*sptep);
+ if (!is_shadow_present_pte(old_child_spte))
+ continue;
+
+ /*
+ * Marking the SPTE as a removed SPTE is not
+ * strictly necessary here as the MMU lock will
+ * stop other threads from concurrently modifying
+ * this SPTE. Using the removed SPTE value keeps
+ * the two branches consistent and simplifies
+ * the function.
+ */
+ WRITE_ONCE(*sptep, REMOVED_SPTE);
+ }
+ handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
+ old_child_spte, REMOVED_SPTE, level - 1,
+ shared);
+ }
+
+ kvm_flush_remote_tlbs_with_address(kvm, gfn,
+ KVM_PAGES_PER_HPAGE(level));
+
+ call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
+}
+
+/**
* handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of
@@ -242,22 +379,22 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
* @old_spte: The value of the SPTE before the change
* @new_spte: The value of the SPTE after the change
* @level: the level of the PT the SPTE is part of in the paging structure
+ * @shared: This operation may not be running under the exclusive use of
+ * the MMU lock and the operation must synchronize with other
+ * threads that might be modifying SPTEs.
*
* Handle bookkeeping that might result from the modification of a SPTE.
* This function must be called for all TDP SPTE modifications.
*/
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level)
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
bool was_leaf = was_present && is_last_spte(old_spte, level);
bool is_leaf = is_present && is_last_spte(new_spte, level);
bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
- u64 *pt;
- struct kvm_mmu_page *sp;
- u64 old_child_spte;
- int i;
WARN_ON(level > PT64_ROOT_MAX_LEVEL);
WARN_ON(level < PG_LEVEL_4K);
@@ -298,15 +435,19 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
*/
if (!was_present && !is_present) {
/*
- * If this change does not involve a MMIO SPTE, it is
- * unexpected. Log the change, though it should not impact the
- * guest since both the former and current SPTEs are nonpresent.
+ * If this change does not involve a MMIO SPTE or removed SPTE,
+ * it is unexpected. Log the change, though it should not
+ * impact the guest since both the former and current SPTEs
+ * are nonpresent.
*/
- if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
+ if (WARN_ON(!is_mmio_spte(old_spte) &&
+ !is_mmio_spte(new_spte) &&
+ !is_removed_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
"different nonpresent SPTE, unless one or both\n"
- "are MMIO SPTEs.\n"
+ "are MMIO SPTEs, or the new SPTE is\n"
+ "a temporary removed SPTE.\n"
"as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
as_id, gfn, old_spte, new_spte, level);
return;
@@ -321,59 +462,124 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* Recursively handle child PTs if the change removed a subtree from
* the paging structure.
*/
- if (was_present && !was_leaf && (pfn_changed || !is_present)) {
- pt = spte_to_child_pt(old_spte, level);
- sp = sptep_to_sp(pt);
+ if (was_present && !was_leaf && (pfn_changed || !is_present))
+ handle_removed_tdp_mmu_page(kvm,
+ spte_to_child_pt(old_spte, level), shared);
+}
- trace_kvm_mmu_prepare_zap_page(sp);
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
+{
+ __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
+ shared);
+ handle_changed_spte_acc_track(old_spte, new_spte, level);
+ handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
+ new_spte, level);
+}
- list_del(&sp->link);
+/*
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically and handle the
+ * associated bookkeeping
+ *
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * Returns: true if the SPTE was set, false if it was not. If false is returned,
+ * this function will have no side-effects.
+ */
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
+{
+ lockdep_assert_held_read(&kvm->mmu_lock);
- if (sp->lpage_disallowed)
- unaccount_huge_nx_page(kvm, sp);
+ /*
+ * Do not change removed SPTEs. Only the thread that froze the SPTE
+ * may modify it.
+ */
+ if (iter->old_spte == REMOVED_SPTE)
+ return false;
- for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- old_child_spte = READ_ONCE(*(pt + i));
- WRITE_ONCE(*(pt + i), 0);
- handle_changed_spte(kvm, as_id,
- gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
- old_child_spte, 0, level - 1);
- }
+ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
+ new_spte) != iter->old_spte)
+ return false;
- kvm_flush_remote_tlbs_with_address(kvm, gfn,
- KVM_PAGES_PER_HPAGE(level));
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
- free_page((unsigned long)pt);
- kmem_cache_free(mmu_page_header_cache, sp);
- }
+ return true;
}
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level)
+static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter)
{
- __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
- handle_changed_spte_acc_track(old_spte, new_spte, level);
- handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
- new_spte, level);
+ /*
+ * Freeze the SPTE by setting it to a special,
+ * non-present value. This will stop other threads from
+ * immediately installing a present entry in its place
+ * before the TLBs are flushed.
+ */
+ if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
+ return false;
+
+ kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
+ KVM_PAGES_PER_HPAGE(iter->level));
+
+ /*
+ * No other thread can overwrite the removed SPTE as they
+ * must either wait on the MMU lock or use
+ * tdp_mmu_set_spte_atomic which will not overrite the
+ * special removed SPTE value. No bookkeeping is needed
+ * here since the SPTE is going from non-present
+ * to non-present.
+ */
+ WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
+
+ return true;
}
+
+/*
+ * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * @kvm: kvm instance
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ * @record_acc_track: Notify the MM subsystem of changes to the accessed state
+ * of the page. Should be set unless handling an MMU
+ * notifier for access tracking. Leaving record_acc_track
+ * unset in that case prevents page accesses from being
+ * double counted.
+ * @record_dirty_log: Record the page as dirty in the dirty bitmap if
+ * appropriate for the change being made. Should be set
+ * unless performing certain dirty logging operations.
+ * Leaving record_dirty_log unset in that case prevents page
+ * writes from being double counted.
+ */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
u64 new_spte, bool record_acc_track,
bool record_dirty_log)
{
- u64 *root_pt = tdp_iter_root_pt(iter);
- struct kvm_mmu_page *root = sptep_to_sp(root_pt);
- int as_id = kvm_mmu_page_as_id(root);
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * No thread should be using this function to set SPTEs to the
+ * temporary removed SPTE value.
+ * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
+ * should be used. If operating under the MMU lock in write mode, the
+ * use of the removed SPTE should not be necessary.
+ */
+ WARN_ON(iter->old_spte == REMOVED_SPTE);
- WRITE_ONCE(*iter->sptep, new_spte);
+ WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
- __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
- iter->level);
+ __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, false);
if (record_acc_track)
handle_changed_spte_acc_track(iter->old_spte, new_spte,
iter->level);
if (record_dirty_log)
- handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
+ handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
iter->old_spte, new_spte,
iter->level);
}
@@ -413,27 +619,44 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
_mmu->shadow_root_level, _start, _end)
/*
- * Flush the TLB if the process should drop kvm->mmu_lock.
- * Return whether the caller still needs to flush the tlb.
+ * Yield if the MMU lock is contended or this thread needs to return control
+ * to the scheduler.
+ *
+ * If this function should yield and flush is set, it will perform a remote
+ * TLB flush before yielding.
+ *
+ * If this function yields, it will also reset the tdp_iter's walk over the
+ * paging structure and the calling function should skip to the next
+ * iteration to allow the iterator to continue its traversal from the
+ * paging structure root.
+ *
+ * Return true if this function yielded and the iterator's traversal was reset.
+ * Return false if a yield was not needed.
*/
-static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
+static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
+ struct tdp_iter *iter, bool flush)
{
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- kvm_flush_remote_tlbs(kvm);
- cond_resched_lock(&kvm->mmu_lock);
- tdp_iter_refresh_walk(iter);
+ /* Ensure forward progress has been made before yielding. */
+ if (iter->next_last_level_gfn == iter->yielded_gfn)
return false;
- } else {
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ rcu_read_unlock();
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ rcu_read_lock();
+
+ WARN_ON(iter->gfn > iter->next_last_level_gfn);
+
+ tdp_iter_restart(iter);
+
return true;
}
-}
-static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
-{
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- cond_resched_lock(&kvm->mmu_lock);
- tdp_iter_refresh_walk(iter);
- }
+ return false;
}
/*
@@ -453,7 +676,15 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
struct tdp_iter iter;
bool flush_needed = false;
+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
+ if (can_yield &&
+ tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) {
+ flush_needed = false;
+ continue;
+ }
+
if (!is_shadow_present_pte(iter.old_spte))
continue;
@@ -468,12 +699,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
tdp_mmu_set_spte(kvm, &iter, 0);
-
- if (can_yield)
- flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
- else
- flush_needed = true;
+ flush_needed = true;
}
+
+ rcu_read_unlock();
return flush_needed;
}
@@ -517,21 +746,18 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
int ret = 0;
int make_spte_ret = 0;
- if (unlikely(is_noslot_pfn(pfn))) {
+ if (unlikely(is_noslot_pfn(pfn)))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
- trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
- } else {
+ else
make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
pfn, iter->old_spte, prefault, true,
map_writable, !shadow_accessed_mask,
&new_spte);
- trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
- }
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else
- tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);
+ else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ return RET_PF_RETRY;
/*
* If the page fault was caused by a write but the page is write
@@ -545,10 +771,16 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
}
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
- if (unlikely(is_mmio_spte(new_spte)))
+ if (unlikely(is_mmio_spte(new_spte))) {
+ trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
+ new_spte);
ret = RET_PF_EMULATE;
+ } else
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ rcu_dereference(iter->sptep));
- trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
+ trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+ rcu_dereference(iter->sptep));
if (!prefault)
vcpu->stat.pf_fixed++;
@@ -586,6 +818,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
huge_page_disallowed, &req_level);
trace_kvm_mmu_spte_requested(gpa, level, pfn);
+
+ rcu_read_lock();
+
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
if (nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(iter.old_spte, gfn,
@@ -601,49 +836,61 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
*/
if (is_shadow_present_pte(iter.old_spte) &&
is_large_pte(iter.old_spte)) {
- tdp_mmu_set_spte(vcpu->kvm, &iter, 0);
-
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
- KVM_PAGES_PER_HPAGE(iter.level));
+ if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
+ break;
/*
* The iter must explicitly re-read the spte here
* because the new value informs the !present
* path below.
*/
- iter.old_spte = READ_ONCE(*iter.sptep);
+ iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
}
if (!is_shadow_present_pte(iter.old_spte)) {
sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
- list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
child_pt = sp->spt;
- clear_page(child_pt);
+
new_spte = make_nonleaf_spte(child_pt,
!shadow_accessed_mask);
- trace_kvm_mmu_get_page(sp, true);
- if (huge_page_disallowed && req_level >= iter.level)
- account_huge_nx_page(vcpu->kvm, sp);
-
- tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
+ if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
+ new_spte)) {
+ tdp_mmu_link_page(vcpu->kvm, sp, true,
+ huge_page_disallowed &&
+ req_level >= iter.level);
+
+ trace_kvm_mmu_get_page(sp, true);
+ } else {
+ tdp_mmu_free_sp(sp);
+ break;
+ }
}
}
- if (WARN_ON(iter.level != level))
+ if (iter.level != level) {
+ rcu_read_unlock();
return RET_PF_RETRY;
+ }
ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
pfn, prefault);
+ rcu_read_unlock();
return ret;
}
-static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
- unsigned long end, unsigned long data,
- int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
- struct kvm_mmu_page *root, gfn_t start,
- gfn_t end, unsigned long data))
+static __always_inline int
+kvm_tdp_mmu_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ struct kvm_mmu_page *root,
+ gfn_t start,
+ gfn_t end,
+ unsigned long data))
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
@@ -705,6 +952,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
int young = 0;
u64 new_spte = 0;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, start, end) {
/*
* If we have a non-accessed entry we don't need to change the
@@ -736,6 +985,8 @@ static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
trace_kvm_age_page(iter.gfn, iter.level, slot, young);
}
+ rcu_read_unlock();
+
return young;
}
@@ -781,6 +1032,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
u64 new_spte;
int need_flush = 0;
+ rcu_read_lock();
+
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
@@ -809,6 +1062,8 @@ static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
if (need_flush)
kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
+ rcu_read_unlock();
+
return 0;
}
@@ -832,21 +1087,27 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
min_level, start, end) {
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ continue;
+
if (!is_shadow_present_pte(iter.old_spte) ||
- !is_last_spte(iter.old_spte, iter.level))
+ !is_last_spte(iter.old_spte, iter.level) ||
+ !(iter.old_spte & PT_WRITABLE_MASK))
continue;
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
spte_set = true;
-
- tdp_mmu_iter_cond_resched(kvm, &iter);
}
+
+ rcu_read_unlock();
return spte_set;
}
@@ -888,7 +1149,12 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, start, end) {
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
+ continue;
+
if (spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -903,9 +1169,9 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
spte_set = true;
-
- tdp_mmu_iter_cond_resched(kvm, &iter);
}
+
+ rcu_read_unlock();
return spte_set;
}
@@ -947,6 +1213,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
struct tdp_iter iter;
u64 new_spte;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
gfn + BITS_PER_LONG) {
if (!mask)
@@ -956,6 +1224,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
!(mask & (1UL << (iter.gfn - gfn))))
continue;
+ mask &= ~(1UL << (iter.gfn - gfn));
+
if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
@@ -969,9 +1239,9 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
}
tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
-
- mask &= ~(1UL << (iter.gfn - gfn));
}
+
+ rcu_read_unlock();
}
/*
@@ -989,7 +1259,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_mmu_page *root;
int root_as_id;
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
@@ -1000,81 +1270,43 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
}
/*
- * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
- * only used for PML, and so will involve setting the dirty bit on each SPTE.
- * Returns true if an SPTE has been changed and the TLBs need to be flushed.
- */
-static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
- gfn_t start, gfn_t end)
-{
- struct tdp_iter iter;
- u64 new_spte;
- bool spte_set = false;
-
- tdp_root_for_each_pte(iter, root, start, end) {
- if (!is_shadow_present_pte(iter.old_spte))
- continue;
-
- new_spte = iter.old_spte | shadow_dirty_mask;
-
- tdp_mmu_set_spte(kvm, &iter, new_spte);
- spte_set = true;
-
- tdp_mmu_iter_cond_resched(kvm, &iter);
- }
-
- return spte_set;
-}
-
-/*
- * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
- * only used for PML, and so will involve setting the dirty bit on each SPTE.
- * Returns true if an SPTE has been changed and the TLBs need to be flushed.
- */
-bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
- struct kvm_mmu_page *root;
- int root_as_id;
- bool spte_set = false;
-
- for_each_tdp_mmu_root_yield_safe(kvm, root) {
- root_as_id = kvm_mmu_page_as_id(root);
- if (root_as_id != slot->as_id)
- continue;
-
- spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
- slot->base_gfn + slot->npages);
- }
- return spte_set;
-}
-
-/*
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
- gfn_t start, gfn_t end)
+ struct kvm_memory_slot *slot)
{
+ gfn_t start = slot->base_gfn;
+ gfn_t end = start + slot->npages;
struct tdp_iter iter;
kvm_pfn_t pfn;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_pte(iter, root, start, end) {
+ if (tdp_mmu_iter_cond_resched(kvm, &iter, spte_set)) {
+ spte_set = false;
+ continue;
+ }
+
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
continue;
pfn = spte_to_pfn(iter.old_spte);
if (kvm_is_reserved_pfn(pfn) ||
- !PageTransCompoundMap(pfn_to_page(pfn)))
+ iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
+ pfn, PG_LEVEL_NUM))
continue;
tdp_mmu_set_spte(kvm, &iter, 0);
- spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
+ spte_set = true;
}
+ rcu_read_unlock();
if (spte_set)
kvm_flush_remote_tlbs(kvm);
}
@@ -1084,7 +1316,7 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
* be replaced by large mappings, for GFNs within the slot.
*/
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot)
+ struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
int root_as_id;
@@ -1094,8 +1326,7 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
if (root_as_id != slot->as_id)
continue;
- zap_collapsible_spte_range(kvm, root, slot->base_gfn,
- slot->base_gfn + slot->npages);
+ zap_collapsible_spte_range(kvm, root, slot);
}
}
@@ -1111,6 +1342,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
u64 new_spte;
bool spte_set = false;
+ rcu_read_lock();
+
tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
if (!is_writable_pte(iter.old_spte))
break;
@@ -1122,6 +1355,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
spte_set = true;
}
+ rcu_read_unlock();
+
return spte_set;
}
@@ -1137,7 +1372,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
int root_as_id;
bool spte_set = false;
- lockdep_assert_held(&kvm->mmu_lock);
+ lockdep_assert_held_write(&kvm->mmu_lock);
for_each_tdp_mmu_root(kvm, root) {
root_as_id = kvm_mmu_page_as_id(root);
if (root_as_id != slot->as_id)
@@ -1162,10 +1397,14 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*root_level = vcpu->arch.mmu->shadow_root_level;
+ rcu_read_lock();
+
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
+ rcu_read_unlock();
+
return leaf;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index cbbdbadd1526..3b761c111bff 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -5,10 +5,6 @@
#include <linux/kvm_host.h>
-void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
-void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-
-bool is_tdp_mmu_root(struct kvm *kvm, hpa_t root);
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root);
@@ -37,9 +33,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
-bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot);
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *slot);
+ struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
@@ -47,4 +42,32 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
+#ifdef CONFIG_X86_64
+void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
+#else
+static inline void kvm_mmu_init_tdp_mmu(struct kvm *kvm) {}
+static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
+#endif
+
+static inline bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!is_tdp_mmu_enabled(kvm))
+ return false;
+ if (WARN_ON(!VALID_PAGE(hpa)))
+ return false;
+
+ sp = to_shadow_page(hpa);
+ if (WARN_ON(!sp))
+ return false;
+
+ return is_tdp_mmu_page(sp) && sp->root_count;
+}
+
#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index f472fdb6ae7e..a8502e02f479 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -75,7 +75,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* variable MTRRs */
WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
- mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
if ((msr & 1) == 0) {
/* MTRR base */
if (!valid_mtrr_type(data & 0xff))
@@ -351,14 +351,14 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (var_mtrr_range_is_valid(cur))
list_del(&mtrr_state->var_ranges[index].node);
- /* Extend the mask with all 1 bits to the left, since those
- * bits must implicitly be 0. The bits are then cleared
- * when reading them.
+ /*
+ * Set all illegal GPA bits in the mask, since those bits must
+ * implicitly be 0. The bits are then cleared when reading them.
*/
if (!is_mtrr_mask)
cur->base = data;
else
- cur->mask = data | (-1LL << cpuid_maxphyaddr(vcpu));
+ cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
/* add it to the list if it's enabled. */
if (var_mtrr_range_is_valid(cur)) {
@@ -426,7 +426,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
else
*pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
- *pdata &= (1ULL << cpuid_maxphyaddr(vcpu)) - 1;
+ *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
}
return 0;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 67741d2a0308..827886c12c16 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -373,7 +373,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
return 1;
if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
- (kvm_x86_ops.get_cpl(vcpu) != 0) &&
+ (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
(kvm_read_cr0(vcpu) & X86_CR0_PE))
return 1;
@@ -383,8 +383,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
- if (lapic_in_kernel(vcpu))
+ if (lapic_in_kernel(vcpu)) {
+ if (kvm_x86_ops.pmu_ops->deliver_pmi)
+ kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+ }
}
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
@@ -473,6 +476,9 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
pmc_stop_counter(pmc);
}
+ if (kvm_x86_ops.pmu_ops->cleanup)
+ kvm_x86_ops.pmu_ops->cleanup(vcpu);
+
bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 067fef51760c..7b30bc967af3 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -39,6 +39,8 @@ struct kvm_pmu_ops {
void (*refresh)(struct kvm_vcpu *vcpu);
void (*init)(struct kvm_vcpu *vcpu);
void (*reset)(struct kvm_vcpu *vcpu);
+ void (*deliver_pmi)(struct kvm_vcpu *vcpu);
+ void (*cleanup)(struct kvm_vcpu *vcpu);
};
static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 0ef84d57b72e..78bdcfac4e40 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -298,6 +298,23 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ bool m = kvm_apic_match_dest(vcpu, source,
+ icrl & APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & APIC_DEST_MASK);
+
+ if (m && !avic_vcpu_is_running(vcpu))
+ kvm_vcpu_wake_up(vcpu);
+ }
+}
+
int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
{
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
@@ -324,28 +341,14 @@ int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
kvm_lapic_reg_write(apic, APIC_ICR, icrl);
break;
- case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm *kvm = svm->vcpu.kvm;
- struct kvm_lapic *apic = svm->vcpu.arch.apic;
-
+ case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING:
/*
* At this point, we expect that the AVIC HW has already
* set the appropriate IRR bits on the valid target
* vcpus. So, we just need to kick the appropriate vcpu.
*/
- kvm_for_each_vcpu(i, vcpu, kvm) {
- bool m = kvm_apic_match_dest(vcpu, apic,
- icrl & APIC_SHORT_MASK,
- GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK);
-
- if (m && !avic_vcpu_is_running(vcpu))
- kvm_vcpu_wake_up(vcpu);
- }
+ avic_kick_target_vcpus(svm->vcpu.kvm, apic, icrl, icrh);
break;
- }
case AVIC_IPI_FAILURE_INVALID_TARGET:
WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
index, svm->vcpu.vcpu_id, icrh, icrl);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index db30670dd8c4..35891d9a1099 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -51,6 +51,23 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
nested_svm_vmexit(svm);
}
+static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ WARN_ON(!is_guest_mode(vcpu));
+
+ if (vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
+ !svm->nested.nested_run_pending) {
+ svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = fault->error_code;
+ svm->vmcb->control.exit_info_2 = fault->address;
+ nested_svm_vmexit(svm);
+ } else {
+ kvm_inject_page_fault(vcpu, fault);
+ }
+}
+
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -58,7 +75,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
u64 pdpte;
int ret;
- ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
offset_in_page(cr3) + index * 8, 8);
if (ret)
return 0;
@@ -248,7 +265,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12)
if (vmcb12_lma) {
if (!(vmcb12->save.cr4 & X86_CR4_PAE) ||
!(vmcb12->save.cr0 & X86_CR0_PE) ||
- (vmcb12->save.cr3 & vcpu->arch.cr3_lm_rsvd_bits))
+ kvm_vcpu_is_illegal_gpa(vcpu, vmcb12->save.cr3))
return false;
}
if (!kvm_is_valid_cr4(&svm->vcpu, vmcb12->save.cr4))
@@ -345,7 +362,7 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
bool nested_npt)
{
- if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))
+ if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
return -EINVAL;
if (!nested_npt && is_pae_paging(vcpu) &&
@@ -392,7 +409,7 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
svm->vmcb->save.rsp = vmcb12->save.rsp;
svm->vmcb->save.rip = vmcb12->save.rip;
svm->vmcb->save.dr7 = vmcb12->save.dr7 | DR7_FIXED_1;
- svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_FIXED_1 | DR6_RTM;
+ svm->vcpu.arch.dr6 = vmcb12->save.dr6 | DR6_ACTIVE_LOW;
svm->vmcb->save.cpl = vmcb12->save.cpl;
}
@@ -436,16 +453,33 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa,
{
int ret;
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
+ vmcb12->save.rip,
+ vmcb12->control.int_ctl,
+ vmcb12->control.event_inj,
+ vmcb12->control.nested_ctl);
+
+ trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
+ vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
+ vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
+ vmcb12->control.intercepts[INTERCEPT_WORD3],
+ vmcb12->control.intercepts[INTERCEPT_WORD4],
+ vmcb12->control.intercepts[INTERCEPT_WORD5]);
+
+
svm->nested.vmcb12_gpa = vmcb12_gpa;
load_nested_vmcb_control(svm, &vmcb12->control);
- nested_prepare_vmcb_save(svm, vmcb12);
nested_prepare_vmcb_control(svm);
+ nested_prepare_vmcb_save(svm, vmcb12);
ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
nested_npt_enabled(svm));
if (ret)
return ret;
+ if (!npt_enabled)
+ svm->vcpu.arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+
svm_set_gif(svm, true);
return 0;
@@ -489,18 +523,6 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
goto out;
}
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
- vmcb12->save.rip,
- vmcb12->control.int_ctl,
- vmcb12->control.event_inj,
- vmcb12->control.nested_ctl);
-
- trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
- vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
- vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
- vmcb12->control.intercepts[INTERCEPT_WORD3],
- vmcb12->control.intercepts[INTERCEPT_WORD4],
- vmcb12->control.intercepts[INTERCEPT_WORD5]);
/* Clear internal status */
kvm_clear_exception_queue(&svm->vcpu);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 48017fef1cd9..874ea309279f 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -22,6 +22,7 @@
#include "x86.h"
#include "svm.h"
+#include "svm_ops.h"
#include "cpuid.h"
#include "trace.h"
@@ -1041,6 +1042,74 @@ e_unpin_memory:
return ret;
}
+static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ void __user *report = (void __user *)(uintptr_t)argp->data;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct sev_data_attestation_report *data;
+ struct kvm_sev_attestation_report params;
+ void __user *p;
+ void *blob = NULL;
+ int ret;
+
+ if (!sev_guest(kvm))
+ return -ENOTTY;
+
+ if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ return -EFAULT;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
+ if (!data)
+ return -ENOMEM;
+
+ /* User wants to query the blob length */
+ if (!params.len)
+ goto cmd;
+
+ p = (void __user *)(uintptr_t)params.uaddr;
+ if (p) {
+ if (params.len > SEV_FW_BLOB_MAX_SIZE) {
+ ret = -EINVAL;
+ goto e_free;
+ }
+
+ ret = -ENOMEM;
+ blob = kmalloc(params.len, GFP_KERNEL);
+ if (!blob)
+ goto e_free;
+
+ data->address = __psp_pa(blob);
+ data->len = params.len;
+ memcpy(data->mnonce, params.mnonce, sizeof(params.mnonce));
+ }
+cmd:
+ data->handle = sev->handle;
+ ret = sev_issue_cmd(kvm, SEV_CMD_ATTESTATION_REPORT, data, &argp->error);
+ /*
+ * If we query the session length, FW responded with expected data.
+ */
+ if (!params.len)
+ goto done;
+
+ if (ret)
+ goto e_free_blob;
+
+ if (blob) {
+ if (copy_to_user(p, blob, params.len))
+ ret = -EFAULT;
+ }
+
+done:
+ params.len = data->len;
+ if (copy_to_user(report, &params, sizeof(params)))
+ ret = -EFAULT;
+e_free_blob:
+ kfree(blob);
+e_free:
+ kfree(data);
+ return ret;
+}
+
int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -1091,6 +1160,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
case KVM_SEV_LAUNCH_SECRET:
r = sev_launch_secret(kvm, &sev_cmd);
break;
+ case KVM_SEV_GET_ATTESTATION_REPORT:
+ r = sev_get_attestation_report(kvm, &sev_cmd);
+ break;
default:
r = -EINVAL;
goto out;
@@ -1994,29 +2066,17 @@ void sev_es_create_vcpu(struct vcpu_svm *svm)
sev_enc_bit));
}
-void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu)
+void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu)
{
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
struct vmcb_save_area *hostsa;
- unsigned int i;
/*
* As an SEV-ES guest, hardware will restore the host state on VMEXIT,
* of which one step is to perform a VMLOAD. Since hardware does not
* perform a VMSAVE on VMRUN, the host savearea must be updated.
*/
- asm volatile(__ex("vmsave %0") : : "a" (__sme_page_pa(sd->save_area)) : "memory");
-
- /*
- * Certain MSRs are restored on VMEXIT, only save ones that aren't
- * restored.
- */
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
- if (host_save_user_msrs[i].sev_es_restored)
- continue;
-
- rdmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
- }
+ vmsave(__sme_page_pa(sd->save_area));
/* XCR0 is restored on VMEXIT, save the current host value */
hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
@@ -2029,22 +2089,6 @@ void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu)
hostsa->xss = host_xss;
}
-void sev_es_vcpu_put(struct vcpu_svm *svm)
-{
- unsigned int i;
-
- /*
- * Certain MSRs are restored on VMEXIT and were saved with vmsave in
- * sev_es_vcpu_load() above. Only restore ones that weren't.
- */
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) {
- if (host_save_user_msrs[i].sev_es_restored)
- continue;
-
- wrmsrl(host_save_user_msrs[i].index, svm->host_user_msrs[i]);
- }
-}
-
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
struct vcpu_svm *svm = to_svm(vcpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 3442d44ca53b..58a45bb139f8 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -41,6 +41,7 @@
#include "trace.h"
#include "svm.h"
+#include "svm_ops.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
@@ -114,13 +115,6 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_INVALID, .always = false },
};
-/* enable NPT for AMD64 and X86 with PAE */
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-bool npt_enabled = true;
-#else
-bool npt_enabled;
-#endif
-
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* pause_filter_count: On processors that support Pause filtering(indicated
@@ -169,9 +163,12 @@ module_param(pause_filter_count_shrink, ushort, 0444);
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);
-/* allow nested paging (virtualized MMU) for all guests */
-static int npt = true;
-module_param(npt, int, S_IRUGO);
+/*
+ * Use nested page tables by default. Note, NPT may get forced off by
+ * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
+ */
+bool npt_enabled = true;
+module_param_named(npt, npt_enabled, bool, 0444);
/* allow nested virtualization in KVM/SVM */
static int nested = true;
@@ -200,9 +197,9 @@ module_param(sev_es, int, 0444);
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
-static u8 rsm_ins_bytes[] = "\x0f\xaa";
+static bool svm_gp_erratum_intercept = true;
-static void svm_complete_interrupts(struct vcpu_svm *svm);
+static u8 rsm_ins_bytes[] = "\x0f\xaa";
static unsigned long iopm_base;
@@ -246,21 +243,6 @@ u32 svm_msrpm_offset(u32 msr)
#define MAX_INST_SIZE 15
-static inline void clgi(void)
-{
- asm volatile (__ex("clgi"));
-}
-
-static inline void stgi(void)
-{
- asm volatile (__ex("stgi"));
-}
-
-static inline void invlpga(unsigned long addr, u32 asid)
-{
- asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
-}
-
static int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -288,6 +270,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if (!(efer & EFER_SVME)) {
svm_leave_nested(svm);
svm_set_gif(svm, true);
+ /* #GP intercept is still needed for vmware backdoor */
+ if (!enable_vmware_backdoor)
+ clr_exception_intercept(svm, GP_VECTOR);
/*
* Free the nested guest state, unless we are in SMM.
@@ -304,6 +289,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
vcpu->arch.efer = old_efer;
return ret;
}
+
+ if (svm_gp_erratum_intercept)
+ set_exception_intercept(svm, GP_VECTOR);
}
}
@@ -925,15 +913,15 @@ static __init void svm_set_cpu_caps(void)
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+ /* Nested VM can receive #VMEXIT instead of triggering #GP */
+ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
-
- /* Enable INVPCID feature */
- kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
}
static __init int svm_hardware_setup(void)
@@ -996,10 +984,15 @@ static __init int svm_hardware_setup(void)
goto err;
}
- if (!boot_cpu_has(X86_FEATURE_NPT))
+ /*
+ * KVM's MMU doesn't support using 2-level paging for itself, and thus
+ * NPT isn't supported if the host is using 2-level paging since host
+ * CR4 is unchanged on VMRUN.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
npt_enabled = false;
- if (npt_enabled && !npt)
+ if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
@@ -1032,6 +1025,9 @@ static __init int svm_hardware_setup(void)
}
}
+ if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+ svm_gp_erratum_intercept = false;
+
if (vgif) {
if (!boot_cpu_has(X86_FEATURE_VGIF))
vgif = false;
@@ -1105,12 +1101,12 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
static void svm_check_invpcid(struct vcpu_svm *svm)
{
/*
- * Intercept INVPCID instruction only if shadow page table is
- * enabled. Interception is not required with nested page table
- * enabled.
+ * Intercept INVPCID if shadow paging is enabled to sync/free shadow
+ * roots, or if INVPCID is disabled in the guest to inject #UD.
*/
if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
- if (!npt_enabled)
+ if (!npt_enabled ||
+ !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
svm_set_intercept(svm, INTERCEPT_INVPCID);
else
svm_clr_intercept(svm, INTERCEPT_INVPCID);
@@ -1205,9 +1201,10 @@ static void init_vmcb(struct vcpu_svm *svm)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
+ svm_set_cr4(&svm->vcpu, 0);
svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
- kvm_set_rflags(&svm->vcpu, 2);
+ kvm_set_rflags(&svm->vcpu, X86_EFLAGS_FIXED);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
@@ -1366,6 +1363,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->vmsa = page_address(vmsa_page);
svm->asid_generation = 0;
+ svm->guest_state_loaded = false;
init_vmcb(svm);
svm_init_osvw(vcpu);
@@ -1413,30 +1411,31 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
}
-static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- int i;
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+ unsigned int i;
- if (unlikely(cpu != vcpu->cpu)) {
- svm->asid_generation = 0;
- vmcb_mark_all_dirty(svm->vmcb);
- }
+ if (svm->guest_state_loaded)
+ return;
+
+ /*
+ * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
+ * area (non-sev-es). Save ones that aren't so we can restore them
+ * individually later.
+ */
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+ /*
+ * Save additional host state that will be restored on VMEXIT (sev-es)
+ * or subsequent vmload of host save area.
+ */
if (sev_es_guest(svm->vcpu.kvm)) {
- sev_es_vcpu_load(svm, cpu);
+ sev_es_prepare_guest_switch(svm, vcpu->cpu);
} else {
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
-#endif
- savesegment(fs, svm->host.fs);
- savesegment(gs, svm->host.gs);
- svm->host.ldt = kvm_read_ldt();
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- rdmsrl(host_save_user_msrs[i].index,
- svm->host_user_msrs[i]);
+ vmsave(__sme_page_pa(sd->save_area));
}
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
@@ -1446,10 +1445,42 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
}
}
+
/* This assumes that the kernel never uses MSR_TSC_AUX */
if (static_cpu_has(X86_FEATURE_RDTSCP))
wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ svm->guest_state_loaded = true;
+}
+
+static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ unsigned int i;
+
+ if (!svm->guest_state_loaded)
+ return;
+
+ /*
+ * Certain MSRs are restored on VMEXIT (sev-es), or vmload of host save
+ * area (non-sev-es). Restore the ones that weren't.
+ */
+ for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
+ wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
+
+ svm->guest_state_loaded = false;
+}
+
+static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
+
+ if (unlikely(cpu != vcpu->cpu)) {
+ svm->asid_generation = 0;
+ vmcb_mark_all_dirty(svm->vmcb);
+ }
+
if (sd->current_vmcb != svm->vmcb) {
sd->current_vmcb = svm->vmcb;
indirect_branch_prediction_barrier();
@@ -1459,30 +1490,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- int i;
-
avic_vcpu_put(vcpu);
+ svm_prepare_host_switch(vcpu);
++vcpu->stat.host_state_reload;
- if (sev_es_guest(svm->vcpu.kvm)) {
- sev_es_vcpu_put(svm);
- } else {
- kvm_load_ldt(svm->host.ldt);
-#ifdef CONFIG_X86_64
- loadsegment(fs, svm->host.fs);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
- load_gs_index(svm->host.gs);
-#else
-#ifdef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
-#endif
-#endif
-
- for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
- wrmsrl(host_save_user_msrs[i].index,
- svm->host_user_msrs[i]);
- }
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1815,7 +1826,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}
-static void update_exception_bitmap(struct kvm_vcpu *vcpu)
+static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1865,7 +1876,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
get_debugreg(vcpu->arch.db[2], 2);
get_debugreg(vcpu->arch.db[3], 3);
/*
- * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here,
+ * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
* because db_interception might need it. We can do it before vmentry.
*/
vcpu->arch.dr6 = svm->vmcb->save.dr6;
@@ -1916,7 +1927,7 @@ static int db_interception(struct vcpu_svm *svm)
if (!(svm->vcpu.guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
!svm->nmi_singlestep) {
- u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1;
+ u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);
return 1;
}
@@ -1962,24 +1973,6 @@ static int ac_interception(struct vcpu_svm *svm)
return 1;
}
-static int gp_interception(struct vcpu_svm *svm)
-{
- struct kvm_vcpu *vcpu = &svm->vcpu;
- u32 error_code = svm->vmcb->control.exit_info_1;
-
- WARN_ON_ONCE(!enable_vmware_backdoor);
-
- /*
- * VMware backdoor emulation on #GP interception only handles IN{S},
- * OUT{S}, and RDPMC, none of which generate a non-zero error code.
- */
- if (error_code) {
- kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
- return 1;
- }
- return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
-}
-
static bool is_erratum_383(void)
{
int err, i;
@@ -2178,6 +2171,107 @@ static int vmrun_interception(struct vcpu_svm *svm)
return nested_svm_vmrun(svm);
}
+enum {
+ NONE_SVM_INSTR,
+ SVM_INSTR_VMRUN,
+ SVM_INSTR_VMLOAD,
+ SVM_INSTR_VMSAVE,
+};
+
+/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
+static int svm_instr_opcode(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
+ return NONE_SVM_INSTR;
+
+ switch (ctxt->modrm) {
+ case 0xd8: /* VMRUN */
+ return SVM_INSTR_VMRUN;
+ case 0xda: /* VMLOAD */
+ return SVM_INSTR_VMLOAD;
+ case 0xdb: /* VMSAVE */
+ return SVM_INSTR_VMSAVE;
+ default:
+ break;
+ }
+
+ return NONE_SVM_INSTR;
+}
+
+static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
+{
+ const int guest_mode_exit_codes[] = {
+ [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
+ [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
+ [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
+ };
+ int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
+ [SVM_INSTR_VMRUN] = vmrun_interception,
+ [SVM_INSTR_VMLOAD] = vmload_interception,
+ [SVM_INSTR_VMSAVE] = vmsave_interception,
+ };
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ svm->vmcb->control.exit_code = guest_mode_exit_codes[opcode];
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ /* Returns '1' or -errno on failure, '0' on success. */
+ ret = nested_svm_vmexit(svm);
+ if (ret)
+ return ret;
+ return 1;
+ }
+ return svm_instr_handlers[opcode](svm);
+}
+
+/*
+ * #GP handling code. Note that #GP can be triggered under the following two
+ * cases:
+ * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
+ * some AMD CPUs when EAX of these instructions are in the reserved memory
+ * regions (e.g. SMM memory on host).
+ * 2) VMware backdoor
+ */
+static int gp_interception(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ u32 error_code = svm->vmcb->control.exit_info_1;
+ int opcode;
+
+ /* Both #GP cases have zero error_code */
+ if (error_code)
+ goto reinject;
+
+ /* Decode the instruction for usage later */
+ if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
+ goto reinject;
+
+ opcode = svm_instr_opcode(vcpu);
+
+ if (opcode == NONE_SVM_INSTR) {
+ if (!enable_vmware_backdoor)
+ goto reinject;
+
+ /*
+ * VMware backdoor emulation on #GP interception only handles
+ * IN{S}, OUT{S}, and RDPMC.
+ */
+ if (!is_guest_mode(vcpu))
+ return kvm_emulate_instruction(vcpu,
+ EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
+ } else
+ return emulate_svm_instr(vcpu, opcode);
+
+reinject:
+ kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
+ return 1;
+}
+
void svm_set_gif(struct vcpu_svm *svm, bool value)
{
if (value) {
@@ -2265,11 +2359,8 @@ static int xsetbv_interception(struct vcpu_svm *svm)
u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
u32 index = kvm_rcx_read(&svm->vcpu);
- if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
- return kvm_skip_emulated_instruction(&svm->vcpu);
- }
-
- return 1;
+ int err = kvm_set_xcr(&svm->vcpu, index, new_bv);
+ return kvm_complete_insn_gp(&svm->vcpu, err);
}
static int rdpru_interception(struct vcpu_svm *svm)
@@ -2530,6 +2621,7 @@ static int dr_interception(struct vcpu_svm *svm)
{
int reg, dr;
unsigned long val;
+ int err = 0;
if (svm->vcpu.guest_debug == 0) {
/*
@@ -2547,20 +2639,16 @@ static int dr_interception(struct vcpu_svm *svm)
reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
-
- if (dr >= 16) { /* mov to DRn */
- if (!kvm_require_dr(&svm->vcpu, dr - 16))
- return 1;
+ if (dr >= 16) { /* mov to DRn */
+ dr -= 16;
val = kvm_register_read(&svm->vcpu, reg);
- kvm_set_dr(&svm->vcpu, dr - 16, val);
+ err = kvm_set_dr(&svm->vcpu, dr, val);
} else {
- if (!kvm_require_dr(&svm->vcpu, dr))
- return 1;
kvm_get_dr(&svm->vcpu, dr, &val);
kvm_register_write(&svm->vcpu, reg, val);
}
- return kvm_skip_emulated_instruction(&svm->vcpu);
+ return kvm_complete_insn_gp(&svm->vcpu, err);
}
static int cr8_write_interception(struct vcpu_svm *svm)
@@ -3354,7 +3442,7 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3479,7 +3567,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !svm_interrupt_blocked(vcpu);
}
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3503,7 +3591,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
}
}
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3560,10 +3648,6 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
invlpga(gva, svm->vmcb->control.asid);
}
-static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
-{
-}
-
static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3708,16 +3792,11 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
if (sev_es_guest(svm->vcpu.kvm)) {
__svm_sev_es_vcpu_run(svm->vmcb_pa);
} else {
+ struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);
+
__svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
-#ifdef CONFIG_X86_64
- native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
-#else
- loadsegment(fs, svm->host.fs);
-#ifndef CONFIG_X86_32_LAZY_GS
- loadsegment(gs, svm->host.gs);
-#endif
-#endif
+ vmload(__sme_page_pa(sd->save_area));
}
/*
@@ -3783,7 +3862,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
svm_set_dr6(svm, vcpu->arch.dr6);
else
- svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM);
+ svm_set_dr6(svm, DR6_ACTIVE_LOW);
clgi();
kvm_load_guest_xsave_state(vcpu);
@@ -3978,7 +4057,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (sev_guest(vcpu->kvm)) {
best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
if (best)
- vcpu->arch.cr3_lm_rsvd_bits &= ~(1UL << (best->ebx & 0x3f));
+ vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
}
if (!kvm_vcpu_apicv_active(vcpu))
@@ -4285,7 +4364,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
return ret;
}
-static void enable_smi_window(struct kvm_vcpu *vcpu)
+static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4439,7 +4518,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_blocking = svm_vcpu_blocking,
.vcpu_unblocking = svm_vcpu_unblocking,
- .update_exception_bitmap = update_exception_bitmap,
+ .update_exception_bitmap = svm_update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
@@ -4482,9 +4561,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.nmi_allowed = svm_nmi_allowed,
.get_nmi_mask = svm_get_nmi_mask,
.set_nmi_mask = svm_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
+ .enable_nmi_window = svm_enable_nmi_window,
+ .enable_irq_window = svm_enable_irq_window,
+ .update_cr8_intercept = svm_update_cr8_intercept,
.set_virtual_apic_mode = svm_set_virtual_apic_mode,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
@@ -4527,7 +4606,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.smi_allowed = svm_smi_allowed,
.pre_enter_smm = svm_pre_enter_smm,
.pre_leave_smm = svm_pre_leave_smm,
- .enable_smi_window = enable_smi_window,
+ .enable_smi_window = svm_enable_smi_window,
.mem_enc_op = svm_mem_enc_op,
.mem_enc_reg_region = svm_register_enc_region,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 6e7d070f8b86..39e071fdab0c 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -23,22 +23,8 @@
#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
-static const struct svm_host_save_msrs {
- u32 index; /* Index of the MSR */
- bool sev_es_restored; /* True if MSR is restored on SEV-ES VMEXIT */
-} host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
- { .index = MSR_STAR, .sev_es_restored = true },
- { .index = MSR_LSTAR, .sev_es_restored = true },
- { .index = MSR_CSTAR, .sev_es_restored = true },
- { .index = MSR_SYSCALL_MASK, .sev_es_restored = true },
- { .index = MSR_KERNEL_GS_BASE, .sev_es_restored = true },
- { .index = MSR_FS_BASE, .sev_es_restored = true },
-#endif
- { .index = MSR_IA32_SYSENTER_CS, .sev_es_restored = true },
- { .index = MSR_IA32_SYSENTER_ESP, .sev_es_restored = true },
- { .index = MSR_IA32_SYSENTER_EIP, .sev_es_restored = true },
- { .index = MSR_TSC_AUX, .sev_es_restored = false },
+static const u32 host_save_user_msrs[] = {
+ MSR_TSC_AUX,
};
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
@@ -130,12 +116,6 @@ struct vcpu_svm {
u64 next_rip;
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- struct {
- u16 fs;
- u16 gs;
- u16 ldt;
- u64 gs_base;
- } host;
u64 spec_ctrl;
/*
@@ -192,6 +172,8 @@ struct vcpu_svm {
u64 ghcb_sa_len;
bool ghcb_sa_sync;
bool ghcb_sa_free;
+
+ bool guest_state_loaded;
};
struct svm_cpu_data {
@@ -587,9 +569,8 @@ int sev_handle_vmgexit(struct vcpu_svm *svm);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
void sev_es_create_vcpu(struct vcpu_svm *svm);
-void sev_es_vcpu_load(struct vcpu_svm *svm, int cpu);
-void sev_es_vcpu_put(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
/* vmenter.S */
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
new file mode 100644
index 000000000000..8170f2a5a16f
--- /dev/null
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_SVM_OPS_H
+#define __KVM_X86_SVM_OPS_H
+
+#include <linux/compiler_types.h>
+
+#include <asm/kvm_host.h>
+
+#define svm_asm(insn, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) "\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ ::: clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm1(insn, op1, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+#define svm_asm2(insn, op1, op2, clobber...) \
+do { \
+ asm_volatile_goto("1: " __stringify(insn) " %1, %0\n\t" \
+ _ASM_EXTABLE(1b, %l[fault]) \
+ :: op1, op2 : clobber : fault); \
+ return; \
+fault: \
+ kvm_spurious_fault(); \
+} while (0)
+
+static inline void clgi(void)
+{
+ svm_asm(clgi);
+}
+
+static inline void stgi(void)
+{
+ svm_asm(stgi);
+}
+
+static inline void invlpga(unsigned long addr, u32 asid)
+{
+ svm_asm2(invlpga, "c"(asid), "a"(addr));
+}
+
+/*
+ * Despite being a physical address, the portion of rAX that is consumed by
+ * VMSAVE, VMLOAD, etc... is still controlled by the effective address size,
+ * hence 'unsigned long' instead of 'hpa_t'.
+ */
+static inline void vmsave(unsigned long pa)
+{
+ svm_asm1(vmsave, "a" (pa), "memory");
+}
+
+static inline void vmload(unsigned long pa)
+{
+ svm_asm1(vmload, "a" (pa), "memory");
+}
+
+#endif /* __KVM_X86_SVM_OPS_H */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 2de30c20bc26..a61c015870e3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -93,6 +93,42 @@ TRACE_EVENT(kvm_hv_hypercall,
);
/*
+ * Tracepoint for Xen hypercall.
+ */
+TRACE_EVENT(kvm_xen_hypercall,
+ TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3, unsigned long a4,
+ unsigned long a5),
+ TP_ARGS(nr, a0, a1, a2, a3, a4, a5),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, nr)
+ __field(unsigned long, a0)
+ __field(unsigned long, a1)
+ __field(unsigned long, a2)
+ __field(unsigned long, a3)
+ __field(unsigned long, a4)
+ __field(unsigned long, a5)
+ ),
+
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->a0 = a0;
+ __entry->a1 = a1;
+ __entry->a2 = a2;
+ __entry->a3 = a3;
+ __entry->a4 = a4;
+ __entry->a4 = a5;
+ ),
+
+ TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx",
+ __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ __entry->a3, __entry->a4, __entry->a5)
+);
+
+
+
+/*
* Tracepoint for PIO.
*/
@@ -256,7 +292,7 @@ TRACE_EVENT(name, \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
- kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, \
+ static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \
&__entry->info2, \
&__entry->intr_info, \
&__entry->error_code); \
@@ -738,7 +774,7 @@ TRACE_EVENT(kvm_emulate_insn,
),
TP_fast_assign(
- __entry->csbase = kvm_x86_ops.get_segment_base(vcpu, VCPU_SREG_CS);
+ __entry->csbase = static_call(kvm_x86_get_segment_base)(vcpu, VCPU_SREG_CS);
__entry->len = vcpu->arch.emulate_ctxt->fetch.ptr
- vcpu->arch.emulate_ctxt->fetch.data;
__entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 3a1861403d73..d1d77985e889 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -19,6 +19,9 @@ extern int __read_mostly pt_mode;
#define PT_MODE_HOST_GUEST 1
#define PMU_CAP_FW_WRITES (1ULL << 13)
+#define PMU_CAP_LBR_FMT 0x3f
+
+#define DEBUGCTLMSR_LBR_MASK (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI)
struct nested_vmx_msrs {
/*
@@ -262,6 +265,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
SECONDARY_EXEC_TSC_SCALING;
}
+static inline bool cpu_has_vmx_bus_lock_detection(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_BUS_LOCK_DETECTION;
+}
+
static inline bool cpu_has_vmx_apicv(void)
{
return cpu_has_vmx_apic_register_virt() &&
@@ -371,11 +380,28 @@ static inline bool vmx_pt_mode_is_host_guest(void)
static inline u64 vmx_get_perf_capabilities(void)
{
+ u64 perf_cap = 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
+
+ perf_cap &= PMU_CAP_LBR_FMT;
+
/*
* Since counters are virtualized, KVM would support full
* width counting unconditionally, even if the host lacks it.
*/
- return PMU_CAP_FW_WRITES;
+ return PMU_CAP_FW_WRITES | perf_cap;
+}
+
+static inline u64 vmx_supported_debugctl(void)
+{
+ u64 debugctl = 0;
+
+ if (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)
+ debugctl |= DEBUGCTLMSR_LBR_MASK;
+
+ return debugctl;
}
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f2b9bfb58206..bcca0b80e0d0 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -12,6 +12,7 @@
#include "nested.h"
#include "pmu.h"
#include "trace.h"
+#include "vmx.h"
#include "x86.h"
static bool __read_mostly enable_shadow_vmcs = 1;
@@ -411,8 +412,8 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
if (nr == DB_VECTOR) {
if (!has_payload) {
payload = vcpu->arch.dr6;
- payload &= ~(DR6_FIXED_1 | DR6_BT);
- payload ^= DR6_RTM;
+ payload &= ~DR6_BT;
+ payload ^= DR6_ACTIVE_LOW;
}
*exit_qual = payload;
} else
@@ -744,8 +745,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
(CC(!nested_cpu_has_vid(vmcs12)) ||
CC(!nested_exit_intr_ack_set(vcpu)) ||
CC((vmcs12->posted_intr_nv & 0xff00)) ||
- CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
- CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
+ CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
return -EINVAL;
/* tpr shadow is needed by all apicv features. */
@@ -758,13 +758,11 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
u32 count, u64 addr)
{
- int maxphyaddr;
-
if (count == 0)
return 0;
- maxphyaddr = cpuid_maxphyaddr(vcpu);
- if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
- (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
+
+ if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
+ !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
return -EINVAL;
return 0;
@@ -1062,14 +1060,6 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
}
}
-static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
-{
- unsigned long invalid_mask;
-
- invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
- return (val & invalid_mask) == 0;
-}
-
/*
* Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
* tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
@@ -1121,7 +1111,7 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
enum vm_entry_failure_code *entry_failure_code)
{
- if (CC(!nested_cr3_valid(vcpu, cr3))) {
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
}
@@ -2177,15 +2167,13 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
/*
- * The PML address never changes, so it is constant in vmcs02.
- * Conceptually we want to copy the PML index from vmcs01 here,
- * and then back to vmcs01 on nested vmexit. But since we flush
- * the log and reset GUEST_PML_INDEX on each vmexit, the PML
- * index is also effectively constant in vmcs02.
+ * PML is emulated for L2, but never enabled in hardware as the MMU
+ * handles A/D emulation. Disabling PML for L2 also avoids having to
+ * deal with filtering out L2 GPAs from the buffer.
*/
if (enable_pml) {
- vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write64(PML_ADDRESS, 0);
+ vmcs_write16(GUEST_PML_INDEX, -1);
}
if (cpu_has_vmx_encls_vmexit())
@@ -2220,7 +2208,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
- u32 exec_control, vmcs12_exec_ctrl;
+ u32 exec_control;
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
@@ -2294,11 +2282,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_ENABLE_VMFUNC);
if (nested_cpu_has(vmcs12,
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
- vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
- ~SECONDARY_EXEC_ENABLE_PML;
- exec_control |= vmcs12_exec_ctrl;
- }
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
+ exec_control |= vmcs12->secondary_vm_exec_control;
+
+ /* PML is emulated and never enabled in hardware for L2. */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
/* VMCS shadowing for L2 is emulated for now */
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
@@ -2532,7 +2520,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* bitwise-or of what L1 wants to trap for L2, and what we want to
* trap. Note that CR0.TS also needs updating - we do this later.
*/
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
@@ -2635,7 +2623,6 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
/* Check for memory type validity */
switch (new_eptp & VMX_EPTP_MT_MASK) {
@@ -2666,7 +2653,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
}
/* Reserved bits should not be set */
- if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f)))
+ if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f)))
return false;
/* AD, if set, should be supported */
@@ -2850,7 +2837,7 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
- CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
+ CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3)))
return -EINVAL;
if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
@@ -3057,35 +3044,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- asm(
- "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
- "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
- "je 1f \n\t"
- __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
- "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
- "1: \n\t"
- "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
-
- /* Check if vmlaunch or vmresume is needed */
- "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
-
- /*
- * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
- * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
- * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
- * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
- */
- "call vmx_vmenter\n\t"
-
- CC_SET(be)
- : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
- : [HOST_RSP]"r"((unsigned long)HOST_RSP),
- [loaded_vmcs]"r"(vmx->loaded_vmcs),
- [launched]"i"(offsetof(struct loaded_vmcs, launched)),
- [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
- [wordsize]"i"(sizeof(ulong))
- : "memory"
- );
+ vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ vmx->loaded_vmcs->launched);
if (vmx->msr_autoload.host.nr)
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
@@ -3330,7 +3290,11 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
enum vm_entry_failure_code entry_failure_code;
bool evaluate_pending_interrupts;
- u32 exit_reason, failed_index;
+ union vmx_exit_reason exit_reason = {
+ .basic = EXIT_REASON_INVALID_STATE,
+ .failed_vmentry = 1,
+ };
+ u32 failed_index;
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
@@ -3382,7 +3346,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
if (nested_vmx_check_guest_state(vcpu, vmcs12,
&entry_failure_code)) {
- exit_reason = EXIT_REASON_INVALID_STATE;
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
vmcs12->exit_qualification = entry_failure_code;
goto vmentry_fail_vmexit;
}
@@ -3393,7 +3357,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
- exit_reason = EXIT_REASON_INVALID_STATE;
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
vmcs12->exit_qualification = entry_failure_code;
goto vmentry_fail_vmexit_guest_mode;
}
@@ -3403,7 +3367,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
vmcs12->vm_entry_msr_load_addr,
vmcs12->vm_entry_msr_load_count);
if (failed_index) {
- exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
+ exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
vmcs12->exit_qualification = failed_index;
goto vmentry_fail_vmexit_guest_mode;
}
@@ -3471,7 +3435,7 @@ vmentry_fail_vmexit:
return NVMX_VMENTRY_VMEXIT;
load_vmcs12_host_state(vcpu, vmcs12);
- vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+ vmcs12->vm_exit_reason = exit_reason.full;
if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
vmx->nested.need_vmcs12_to_shadow_sync = true;
return NVMX_VMENTRY_VMEXIT;
@@ -4234,9 +4198,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
- if (!enable_ept)
- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
@@ -4529,6 +4490,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_set_virtual_apic_mode(vcpu);
}
+ if (vmx->nested.update_vmcs01_cpu_dirty_logging) {
+ vmx->nested.update_vmcs01_cpu_dirty_logging = false;
+ vmx_update_cpu_dirty_logging(vcpu);
+ }
+
/* Unpin physical memory we referred to in vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_clean(vmx->nested.apic_access_page);
@@ -5559,7 +5525,12 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
fail:
- nested_vmx_vmexit(vcpu, vmx->exit_reason,
+ /*
+ * This is effectively a reflected VM-Exit, as opposed to a synthesized
+ * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
+ * EXIT_REASON_VMFUNC as the exit reason.
+ */
+ nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
vmx_get_intr_info(vcpu),
vmx_get_exit_qual(vcpu));
return 1;
@@ -5627,7 +5598,8 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
* MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
*/
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12, u32 exit_reason)
+ struct vmcs12 *vmcs12,
+ union vmx_exit_reason exit_reason)
{
u32 msr_index = kvm_rcx_read(vcpu);
gpa_t bitmap;
@@ -5641,7 +5613,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
* First we need to figure out which of the four to use:
*/
bitmap = vmcs12->msr_bitmap;
- if (exit_reason == EXIT_REASON_MSR_WRITE)
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
bitmap += 2048;
if (msr_index >= 0xc0000000) {
msr_index -= 0xc0000000;
@@ -5778,11 +5750,12 @@ static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
* Return true if L0 wants to handle an exit from L2 regardless of whether or not
* L1 wants the exit. Only call this when in is_guest_mode (L2).
*/
-static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
{
u32 intr_info;
- switch ((u16)exit_reason) {
+ switch ((u16)exit_reason.basic) {
case EXIT_REASON_EXCEPTION_NMI:
intr_info = vmx_get_intr_info(vcpu);
if (is_nmi(intr_info))
@@ -5820,7 +5793,10 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_PREEMPTION_TIMER:
return true;
case EXIT_REASON_PML_FULL:
- /* We emulate PML support to L1. */
+ /*
+ * PML is emulated for an L1 VMM and should never be enabled in
+ * vmcs02, always "handle" PML_FULL by exiting to userspace.
+ */
return true;
case EXIT_REASON_VMFUNC:
/* VM functions are emulated through L2->L0 vmexits. */
@@ -5838,12 +5814,13 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
* Return 1 if L1 wants to intercept an exit from L2. Only call this when in
* is_guest_mode (L2).
*/
-static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
u32 intr_info;
- switch ((u16)exit_reason) {
+ switch ((u16)exit_reason.basic) {
case EXIT_REASON_EXCEPTION_NMI:
intr_info = vmx_get_intr_info(vcpu);
if (is_nmi(intr_info))
@@ -5962,7 +5939,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason)
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx->exit_reason;
unsigned long exit_qual;
u32 exit_intr_info;
@@ -5981,7 +5958,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
goto reflect_vmexit;
}
- trace_kvm_nested_vmexit(exit_reason, vcpu, KVM_ISA_VMX);
+ trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
/* If L0 (KVM) wants the exit, it trumps L1's desires. */
if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
@@ -6007,7 +5984,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
exit_qual = vmx_get_exit_qual(vcpu);
reflect_vmexit:
- nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
+ nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
return true;
}
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index cdf5f34518f4..9efc1a6b8693 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -152,12 +152,17 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
return &counters[array_index_nospec(idx, num_counters)];
}
-static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
{
if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
- return false;
+ return 0;
- return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES;
+ return vcpu->arch.perf_capabilities;
+}
+
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+ return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0;
}
static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
@@ -168,6 +173,41 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
}
+bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
+{
+ /*
+ * As a first step, a guest could only enable LBR feature if its
+ * cpu model is the same as the host because the LBR registers
+ * would be pass-through to the guest and they're model specific.
+ */
+ return boot_cpu_data.x86_model == guest_cpuid_model(vcpu);
+}
+
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
+{
+ struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+
+ return lbr->nr && (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_LBR_FMT);
+}
+
+static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
+{
+ struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu);
+ bool ret = false;
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ return ret;
+
+ ret = (index == MSR_LBR_SELECT) || (index == MSR_LBR_TOS) ||
+ (index >= records->from && index < records->from + records->nr) ||
+ (index >= records->to && index < records->to + records->nr);
+
+ if (!ret && records->info)
+ ret = (index >= records->info && index < records->info + records->nr);
+
+ return ret;
+}
+
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -183,7 +223,8 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
- get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr);
+ get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr) ||
+ intel_pmu_is_valid_lbr_msr(vcpu, msr);
break;
}
@@ -202,6 +243,111 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
return pmc;
}
+static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (lbr_desc->event) {
+ perf_event_release_kernel(lbr_desc->event);
+ lbr_desc->event = NULL;
+ vcpu_to_pmu(vcpu)->event_count--;
+ }
+}
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct perf_event *event;
+
+ /*
+ * The perf_event_attr is constructed in the minimum efficient way:
+ * - set 'pinned = true' to make it task pinned so that if another
+ * cpu pinned event reclaims LBR, the event->oncpu will be set to -1;
+ * - set '.exclude_host = true' to record guest branches behavior;
+ *
+ * - set '.config = INTEL_FIXED_VLBR_EVENT' to indicates host perf
+ * schedule the event without a real HW counter but a fake one;
+ * check is_guest_lbr_event() and __intel_get_event_constraints();
+ *
+ * - set 'sample_type = PERF_SAMPLE_BRANCH_STACK' and
+ * 'branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ * PERF_SAMPLE_BRANCH_USER' to configure it as a LBR callstack
+ * event, which helps KVM to save/restore guest LBR records
+ * during host context switches and reduces quite a lot overhead,
+ * check branch_user_callstack() and intel_pmu_lbr_sched_task();
+ */
+ struct perf_event_attr attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(attr),
+ .config = INTEL_FIXED_VLBR_EVENT,
+ .sample_type = PERF_SAMPLE_BRANCH_STACK,
+ .pinned = true,
+ .exclude_host = true,
+ .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
+ PERF_SAMPLE_BRANCH_USER,
+ };
+
+ if (unlikely(lbr_desc->event)) {
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ return 0;
+ }
+
+ event = perf_event_create_kernel_counter(&attr, -1,
+ current, NULL, NULL);
+ if (IS_ERR(event)) {
+ pr_debug_ratelimited("%s: failed %ld\n",
+ __func__, PTR_ERR(event));
+ return PTR_ERR(event);
+ }
+ lbr_desc->event = event;
+ pmu->event_count++;
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ return 0;
+}
+
+/*
+ * It's safe to access LBR msrs from guest when they have not
+ * been passthrough since the host would help restore or reset
+ * the LBR msrs records when the guest LBR event is scheduled in.
+ */
+static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info, bool read)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ u32 index = msr_info->index;
+
+ if (!intel_pmu_is_valid_lbr_msr(vcpu, index))
+ return false;
+
+ if (!lbr_desc->event && intel_pmu_create_guest_lbr_event(vcpu) < 0)
+ goto dummy;
+
+ /*
+ * Disable irq to ensure the LBR feature doesn't get reclaimed by the
+ * host at the time the value is read from the msr, and this avoids the
+ * host LBR value to be leaked to the guest. If LBR has been reclaimed,
+ * return 0 on guest reads.
+ */
+ local_irq_disable();
+ if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) {
+ if (read)
+ rdmsrl(index, msr_info->data);
+ else
+ wrmsrl(index, msr_info->data);
+ __set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+ local_irq_enable();
+ return true;
+ }
+ clear_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
+ local_irq_enable();
+
+dummy:
+ if (read)
+ msr_info->data = 0;
+ return true;
+}
+
static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -236,7 +382,8 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
msr_info->data = pmc->eventsel;
return 0;
- }
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true))
+ return 0;
}
return 1;
@@ -307,7 +454,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
reprogram_gp_counter(pmc, data);
return 0;
}
- }
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
+ return 0;
}
return 1;
@@ -316,6 +464,8 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
struct x86_pmu_capability x86_pmu;
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
@@ -327,7 +477,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
- vcpu->arch.perf_capabilities = 0;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
@@ -340,8 +489,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
return;
perf_get_x86_pmu_capability(&x86_pmu);
- if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
- vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
x86_pmu.num_counters_gp);
@@ -385,12 +532,21 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
nested_vmx_pmu_entry_exit_ctls_update(vcpu);
+
+ if (intel_pmu_lbr_is_compatible(vcpu))
+ x86_perf_get_lbr(&lbr_desc->records);
+ else
+ lbr_desc->records.nr = 0;
+
+ if (lbr_desc->records.nr)
+ bitmap_set(pmu->all_valid_pmc_idx, INTEL_PMC_IDX_FIXED_VLBR, 1);
}
static void intel_pmu_init(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
@@ -405,6 +561,11 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
pmu->fixed_counters[i].current_config = 0;
}
+
+ vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
+ lbr_desc->records.nr = 0;
+ lbr_desc->event = NULL;
+ lbr_desc->msr_passthrough = false;
}
static void intel_pmu_reset(struct kvm_vcpu *vcpu)
@@ -429,6 +590,119 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
pmu->global_ovf_ctrl = 0;
+
+ intel_pmu_release_guest_lbr_event(vcpu);
+}
+
+/*
+ * Emulate LBR_On_PMI behavior for 1 < pmu.version < 4.
+ *
+ * If Freeze_LBR_On_PMI = 1, the LBR is frozen on PMI and
+ * the KVM emulates to clear the LBR bit (bit 0) in IA32_DEBUGCTL.
+ *
+ * Guest needs to re-enable LBR to resume branches recording.
+ */
+static void intel_pmu_legacy_freezing_lbrs_on_pmi(struct kvm_vcpu *vcpu)
+{
+ u64 data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
+ if (data & DEBUGCTLMSR_FREEZE_LBRS_ON_PMI) {
+ data &= ~DEBUGCTLMSR_LBR;
+ vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ }
+}
+
+static void intel_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ u8 version = vcpu_to_pmu(vcpu)->version;
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ return;
+
+ if (version > 1 && version < 4)
+ intel_pmu_legacy_freezing_lbrs_on_pmi(vcpu);
+}
+
+static void vmx_update_intercept_for_lbr_msrs(struct kvm_vcpu *vcpu, bool set)
+{
+ struct x86_pmu_lbr *lbr = vcpu_to_lbr_records(vcpu);
+ int i;
+
+ for (i = 0; i < lbr->nr; i++) {
+ vmx_set_intercept_for_msr(vcpu, lbr->from + i, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(vcpu, lbr->to + i, MSR_TYPE_RW, set);
+ if (lbr->info)
+ vmx_set_intercept_for_msr(vcpu, lbr->info + i, MSR_TYPE_RW, set);
+ }
+
+ vmx_set_intercept_for_msr(vcpu, MSR_LBR_SELECT, MSR_TYPE_RW, set);
+ vmx_set_intercept_for_msr(vcpu, MSR_LBR_TOS, MSR_TYPE_RW, set);
+}
+
+static inline void vmx_disable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc->msr_passthrough)
+ return;
+
+ vmx_update_intercept_for_lbr_msrs(vcpu, true);
+ lbr_desc->msr_passthrough = false;
+}
+
+static inline void vmx_enable_lbr_msrs_passthrough(struct kvm_vcpu *vcpu)
+{
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (lbr_desc->msr_passthrough)
+ return;
+
+ vmx_update_intercept_for_lbr_msrs(vcpu, false);
+ lbr_desc->msr_passthrough = true;
+}
+
+/*
+ * Higher priority host perf events (e.g. cpu pinned) could reclaim the
+ * pmu resources (e.g. LBR) that were assigned to the guest. This is
+ * usually done via ipi calls (more details in perf_install_in_context).
+ *
+ * Before entering the non-root mode (with irq disabled here), double
+ * confirm that the pmu features enabled to the guest are not reclaimed
+ * by higher priority host events. Otherwise, disallow vcpu's access to
+ * the reclaimed features.
+ */
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+
+ if (!lbr_desc->event) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+ if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
+ goto warn;
+ if (test_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use))
+ goto warn;
+ return;
+ }
+
+ if (lbr_desc->event->state < PERF_EVENT_STATE_ACTIVE) {
+ vmx_disable_lbr_msrs_passthrough(vcpu);
+ __clear_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
+ goto warn;
+ } else
+ vmx_enable_lbr_msrs_passthrough(vcpu);
+
+ return;
+
+warn:
+ pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n",
+ vcpu->vcpu_id);
+}
+
+static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
+{
+ if (!(vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR))
+ intel_pmu_release_guest_lbr_event(vcpu);
}
struct kvm_pmu_ops intel_pmu_ops = {
@@ -445,4 +719,6 @@ struct kvm_pmu_ops intel_pmu_ops = {
.refresh = intel_pmu_refresh,
.init = intel_pmu_init,
.reset = intel_pmu_reset,
+ .deliver_pmi = intel_pmu_deliver_pmi,
+ .cleanup = intel_pmu_cleanup,
};
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index f02962dcc72c..4831bc44ce66 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -54,7 +54,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
dest = cpu_physical_id(cpu);
- if (x2apic_enabled())
+ if (x2apic_mode)
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
@@ -104,7 +104,7 @@ static void __pi_post_block(struct kvm_vcpu *vcpu)
dest = cpu_physical_id(vcpu->cpu);
- if (x2apic_enabled())
+ if (x2apic_mode)
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
@@ -174,7 +174,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
*/
dest = cpu_physical_id(vcpu->pre_pcpu);
- if (x2apic_enabled())
+ if (x2apic_mode)
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index e85aa5faa22d..3a6461694fc2 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -44,7 +44,7 @@
* they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
* to vmx_vmexit.
*/
-SYM_FUNC_START(vmx_vmenter)
+SYM_FUNC_START_LOCAL(vmx_vmenter)
/* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
je 2f
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index eb69fef57485..32cf8287d4a7 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -50,6 +50,7 @@
#include "capabilities.h"
#include "cpuid.h"
#include "evmcs.h"
+#include "hyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
@@ -552,7 +553,7 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
struct hv_partition_assist_pg **p_hv_pa_pg =
- &vcpu->kvm->arch.hyperv.hv_pa_pg;
+ &to_kvm_hv(vcpu->kvm)->hv_pa_pg;
/*
* Synthetic VM-Exit is not enabled in current code and so All
* evmcs in singe VM shares same assist page.
@@ -658,6 +659,14 @@ static bool is_valid_passthrough_msr(u32 msr)
case MSR_IA32_RTIT_CR3_MATCH:
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
+ case MSR_LBR_SELECT:
+ case MSR_LBR_TOS:
+ case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
+ case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
+ case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
+ case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
+ case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
+ /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
return true;
}
@@ -806,7 +815,7 @@ static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
return *p;
}
-void update_exception_bitmap(struct kvm_vcpu *vcpu)
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
@@ -1102,7 +1111,7 @@ static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
/* The base must be 128-byte aligned and a legal physical address. */
- return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
+ return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
@@ -1577,7 +1586,7 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
* i.e. we end up advancing IP with some random value.
*/
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
- to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
+ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
#ifdef CONFIG_X86_64
@@ -1924,6 +1933,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
goto find_uret_msr;
+ case MSR_IA32_DEBUGCTLMSR:
+ msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ break;
default:
find_uret_msr:
msr = vmx_find_uret_msr(vmx, msr_info->index);
@@ -1947,6 +1959,16 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
return (unsigned long)data;
}
+static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu)
+{
+ u64 debugctl = vmx_supported_debugctl();
+
+ if (!intel_pmu_lbr_is_enabled(vcpu))
+ debugctl &= ~DEBUGCTLMSR_LBR_MASK;
+
+ return debugctl;
+}
+
/*
* Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -1997,14 +2019,29 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
- case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_DEBUGCTLMSR: {
+ u64 invalid = data & ~vcpu_supported_debugctl(vcpu);
+ if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
+ data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+ invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
+ }
+
+ if (invalid)
+ return 1;
+
if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
VM_EXIT_SAVE_DEBUG_CONTROLS)
get_vmcs12(vcpu)->guest_ia32_debugctl = data;
- ret = kvm_set_msr_common(vcpu, msr_info);
- break;
-
+ vmcs_write64(GUEST_IA32_DEBUGCTL, data);
+ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
+ (data & DEBUGCTLMSR_LBR))
+ intel_pmu_create_guest_lbr_event(vcpu);
+ return 0;
+ }
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
@@ -2196,6 +2233,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((data >> 32) != 0)
return 1;
goto find_uret_msr;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (data && !vcpu_to_pmu(vcpu)->version)
+ return 1;
+ if (data & PMU_CAP_LBR_FMT) {
+ if ((data & PMU_CAP_LBR_FMT) !=
+ (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
+ return 1;
+ if (!intel_pmu_lbr_is_compatible(vcpu))
+ return 1;
+ }
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
default:
find_uret_msr:
@@ -2265,7 +2314,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
u64 msr;
cr4_set_bits(X86_CR4_VMXE);
- intel_pt_handle_vmx(1);
asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
_ASM_EXTABLE(1b, %l[fault])
@@ -2276,7 +2324,6 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
fault:
WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- intel_pt_handle_vmx(0);
cr4_clear_bits(X86_CR4_VMXE);
return -EFAULT;
@@ -2299,9 +2346,13 @@ static int hardware_enable(void)
!hv_get_vp_assist_page(cpu))
return -EFAULT;
+ intel_pt_handle_vmx(1);
+
r = kvm_cpu_vmxon(phys_addr);
- if (r)
+ if (r) {
+ intel_pt_handle_vmx(0);
return r;
+ }
if (enable_ept)
ept_sync_global();
@@ -2319,22 +2370,14 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-
-/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
- * tricks.
- */
-static void kvm_cpu_vmxoff(void)
-{
- asm volatile (__ex("vmxoff"));
-
- intel_pt_handle_vmx(0);
- cr4_clear_bits(X86_CR4_VMXE);
-}
-
static void hardware_disable(void)
{
vmclear_local_loaded_vmcss();
- kvm_cpu_vmxoff();
+
+ if (cpu_vmxoff())
+ kvm_spurious_fault();
+
+ intel_pt_handle_vmx(0);
}
/*
@@ -2428,7 +2471,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
- SECONDARY_EXEC_ENABLE_VMFUNC;
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_BUS_LOCK_DETECTION;
if (cpu_has_sgx())
opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
if (adjust_vmx_controls(min2, opt2,
@@ -2739,7 +2783,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
@@ -2819,7 +2863,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
@@ -3774,7 +3818,7 @@ static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type, bool value)
{
if (value)
@@ -4233,7 +4277,12 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
*/
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- if (!enable_pml)
+ /*
+ * PML is enabled/disabled when dirty logging of memsmlots changes, but
+ * it needs to be set here when dirty logging is already active, e.g.
+ * if this vCPU was created after dirty logging was enabled.
+ */
+ if (!vcpu->kvm->arch.cpu_dirty_logging_count)
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
if (cpu_has_vmx_xsaves()) {
@@ -4251,24 +4300,17 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
}
vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
-
- /*
- * Expose INVPCID if and only if PCID is also exposed to the guest.
- * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
- * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect
- * behavior from the guest perspective (it would expect #GP or #PF).
- */
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
- guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
-
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
ENABLE_USR_WAIT_PAUSE, false);
+ if (!vcpu->kvm->arch.bus_lock_detection_enabled)
+ exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
+
vmx->secondary_exec_control = exec_control;
}
@@ -4467,23 +4509,23 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_set_cr4(vcpu, 0);
vmx_set_efer(vcpu, 0);
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
vpid_sync_context(vmx->vpid);
if (init_event)
vmx_clear_hlt(vcpu);
}
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
if (!enable_vnmi ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
- enable_irq_window(vcpu);
+ vmx_enable_irq_window(vcpu);
return;
}
@@ -4824,7 +4866,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
}
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
fallthrough;
case BP_VECTOR:
@@ -5049,6 +5091,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
int dr, dr7, reg;
+ int err = 1;
exit_qualification = vmx_get_exit_qual(vcpu);
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
@@ -5057,9 +5100,9 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (!kvm_require_dr(vcpu, dr))
return 1;
- /* Do not handle if the CPL > 0, will trigger GP on re-entry */
- if (!kvm_require_cpl(vcpu, 0))
- return 1;
+ if (kvm_x86_ops.get_cpl(vcpu) > 0)
+ goto out;
+
dr7 = vmcs_readl(GUEST_DR7);
if (dr7 & DR7_GD) {
/*
@@ -5068,7 +5111,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
* guest debugging itself.
*/
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
- vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1;
+ vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW;
vcpu->run->debug.arch.dr7 = dr7;
vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
vcpu->run->debug.arch.exception = DB_VECTOR;
@@ -5096,14 +5139,15 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (exit_qualification & TYPE_MOV_FROM_DR) {
unsigned long val;
- if (kvm_get_dr(vcpu, dr, &val))
- return 1;
+ kvm_get_dr(vcpu, dr, &val);
kvm_register_write(vcpu, reg, val);
- } else
- if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
- return 1;
+ err = 0;
+ } else {
+ err = kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg));
+ }
- return kvm_skip_emulated_instruction(vcpu);
+out:
+ return kvm_complete_insn_gp(vcpu, err);
}
static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
@@ -5177,9 +5221,8 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
u64 new_bv = kvm_read_edx_eax(vcpu);
u32 index = kvm_rcx_read(vcpu);
- if (kvm_set_xcr(vcpu, index, new_bv) == 0)
- return kvm_skip_emulated_instruction(vcpu);
- return 1;
+ int err = kvm_set_xcr(vcpu, index, new_bv);
+ return kvm_complete_insn_gp(vcpu, err);
}
static int handle_apic_access(struct kvm_vcpu *vcpu)
@@ -5600,6 +5643,13 @@ static int handle_encls(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
+{
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+ return 0;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -5656,6 +5706,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
+ [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
};
static const int kvm_vmx_max_exit_handlers =
@@ -5667,7 +5718,7 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
struct vcpu_vmx *vmx = to_vmx(vcpu);
*info1 = vmx_get_exit_qual(vcpu);
- if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+ if (!(vmx->exit_reason.failed_vmentry)) {
*info2 = vmx->idt_vectoring_info;
*intr_info = vmx_get_intr_info(vcpu);
if (is_exception_with_error_code(*intr_info))
@@ -5720,24 +5771,6 @@ static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
-/*
- * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
- * Called before reporting dirty_bitmap to userspace.
- */
-static void kvm_flush_pml_buffers(struct kvm *kvm)
-{
- int i;
- struct kvm_vcpu *vcpu;
- /*
- * We only need to kick vcpu out of guest mode here, as PML buffer
- * is flushed at beginning of all VMEXITs, and it's obvious that only
- * vcpus running in guest are possible to have unflushed GPAs in PML
- * buffer.
- */
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_vcpu_kick(vcpu);
-}
-
static void vmx_dump_sel(char *name, uint32_t sel)
{
pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
@@ -5908,20 +5941,22 @@ void dump_vmcs(void)
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ u16 exit_handler_index;
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
* updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
* querying dirty_bitmap, we only need to kick all vcpus out of guest
* mode as if vcpus is in root mode, the PML buffer must has been
- * flushed already.
+ * flushed already. Note, PML is never enabled in hardware while
+ * running L2.
*/
- if (enable_pml)
+ if (enable_pml && !is_guest_mode(vcpu))
vmx_flush_pml_buffer(vcpu);
/*
@@ -5938,6 +5973,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (is_guest_mode(vcpu)) {
/*
+ * PML is never enabled when running L2, bail immediately if a
+ * PML full exit occurs as something is horribly wrong.
+ */
+ if (exit_reason.basic == EXIT_REASON_PML_FULL)
+ goto unexpected_vmexit;
+
+ /*
* The host physical addresses of some pages of guest memory
* are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
* Page). The CPU may write to these pages via their host
@@ -5954,11 +5996,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 1;
}
- if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ if (exit_reason.failed_vmentry) {
dump_vmcs();
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
- = exit_reason;
+ = exit_reason.full;
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -5980,18 +6022,18 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* will cause infinite loop.
*/
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
- (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
- exit_reason != EXIT_REASON_EPT_VIOLATION &&
- exit_reason != EXIT_REASON_PML_FULL &&
- exit_reason != EXIT_REASON_APIC_ACCESS &&
- exit_reason != EXIT_REASON_TASK_SWITCH)) {
+ (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason.basic != EXIT_REASON_PML_FULL &&
+ exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
+ exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
vcpu->run->internal.ndata = 3;
vcpu->run->internal.data[0] = vectoring_info;
- vcpu->run->internal.data[1] = exit_reason;
+ vcpu->run->internal.data[1] = exit_reason.full;
vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
- if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
+ if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
vcpu->run->internal.ndata++;
vcpu->run->internal.data[3] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
@@ -6023,42 +6065,62 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
- if (exit_reason >= kvm_vmx_max_exit_handlers)
+ if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
goto unexpected_vmexit;
#ifdef CONFIG_RETPOLINE
- if (exit_reason == EXIT_REASON_MSR_WRITE)
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
- else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
+ else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
return handle_preemption_timer(vcpu);
- else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
+ else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
return handle_interrupt_window(vcpu);
- else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
return handle_external_interrupt(vcpu);
- else if (exit_reason == EXIT_REASON_HLT)
+ else if (exit_reason.basic == EXIT_REASON_HLT)
return kvm_emulate_halt(vcpu);
- else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
+ else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
return handle_ept_misconfig(vcpu);
#endif
- exit_reason = array_index_nospec(exit_reason,
- kvm_vmx_max_exit_handlers);
- if (!kvm_vmx_exit_handlers[exit_reason])
+ exit_handler_index = array_index_nospec((u16)exit_reason.basic,
+ kvm_vmx_max_exit_handlers);
+ if (!kvm_vmx_exit_handlers[exit_handler_index])
goto unexpected_vmexit;
- return kvm_vmx_exit_handlers[exit_reason](vcpu);
+ return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
unexpected_vmexit:
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+ exit_reason.full);
dump_vmcs();
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = exit_reason;
+ vcpu->run->internal.data[0] = exit_reason.full;
vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
return 0;
}
+static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+{
+ int ret = __vmx_handle_exit(vcpu, exit_fastpath);
+
+ /*
+ * Even when current exit reason is handled by KVM internally, we
+ * still need to exit to user space when bus lock detected to inform
+ * that there is a bus lock in guest.
+ */
+ if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
+ if (ret > 0)
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+ return 0;
+ }
+ return ret;
+}
+
/*
* Software based L1D cache flush which is used when microcode providing
* the cache control MSR is not loaded.
@@ -6129,7 +6191,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
: "eax", "ebx", "ecx", "edx");
}
-static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
int tpr_threshold;
@@ -6373,9 +6435,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
- else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
+ else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
handle_exception_nmi_irqoff(vmx);
}
@@ -6518,8 +6580,8 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
int i, nr_msrs;
struct perf_guest_switch_msr *msrs;
+ /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. */
msrs = perf_guest_get_msrs(&nr_msrs);
-
if (!msrs)
return;
@@ -6567,7 +6629,7 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
- switch (to_vmx(vcpu)->exit_reason) {
+ switch (to_vmx(vcpu)->exit_reason.basic) {
case EXIT_REASON_MSR_WRITE:
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
@@ -6577,8 +6639,6 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
}
}
-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
-
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx)
{
@@ -6638,11 +6698,9 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
- fastpath_t exit_fastpath;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
-reenter_guest:
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked))
@@ -6696,6 +6754,8 @@ reenter_guest:
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
+ if (intel_pmu_lbr_is_enabled(vcpu))
+ vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
@@ -6734,12 +6794,12 @@ reenter_guest:
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
/* All fields are clean at this point */
- if (static_branch_unlikely(&enable_evmcs))
+ if (static_branch_unlikely(&enable_evmcs)) {
current_evmcs->hv_clean_fields |=
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
- if (static_branch_unlikely(&enable_evmcs))
- current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
+ current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
+ }
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (vmx->host_debugctlmsr)
@@ -6768,21 +6828,23 @@ reenter_guest:
vmx->idt_vectoring_info = 0;
if (unlikely(vmx->fail)) {
- vmx->exit_reason = 0xdead;
+ vmx->exit_reason.full = 0xdead;
return EXIT_FASTPATH_NONE;
}
- vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
- if (unlikely((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY))
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
- trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
+ if (likely(!vmx->exit_reason.failed_vmentry))
+ vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
+
+ trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
- if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ if (unlikely(vmx->exit_reason.failed_vmentry))
return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
- vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
@@ -6790,22 +6852,7 @@ reenter_guest:
if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;
- exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
- if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) {
- if (!kvm_vcpu_exit_request(vcpu)) {
- /*
- * FIXME: this goto should be a loop in vcpu_enter_guest,
- * but it would incur the cost of a retpoline for now.
- * Revisit once static calls are available.
- */
- if (vcpu->arch.apicv_active)
- vmx_sync_pir_to_irr(vcpu);
- goto reenter_guest;
- }
- exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
- }
-
- return exit_fastpath;
+ return vmx_exit_handlers_fastpath(vcpu);
}
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@ -7256,7 +7303,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
set_cr4_guest_host_mask(vmx);
/* Refresh #PF interception to account for MAXPHYADDR changes. */
- update_exception_bitmap(vcpu);
+ vmx_update_exception_bitmap(vcpu);
}
static __init void vmx_set_cpu_caps(void)
@@ -7270,8 +7317,8 @@ static __init void vmx_set_cpu_caps(void)
/* CPUID 0x7 */
if (kvm_mpx_supported())
kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
- if (cpu_has_vmx_invpcid())
- kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
+ if (!cpu_has_vmx_invpcid())
+ kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
if (vmx_pt_mode_is_host_guest())
kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
@@ -7449,30 +7496,24 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
shrink_ple_window(vcpu);
}
-static void vmx_slot_enable_log_dirty(struct kvm *kvm,
- struct kvm_memory_slot *slot)
-{
- if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
- kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
- kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
-}
-
-static void vmx_slot_disable_log_dirty(struct kvm *kvm,
- struct kvm_memory_slot *slot)
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
- kvm_mmu_slot_set_dirty(kvm, slot);
-}
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
-static void vmx_flush_log_dirty(struct kvm *kvm)
-{
- kvm_flush_pml_buffers(kvm);
-}
+ if (is_guest_mode(vcpu)) {
+ vmx->nested.update_vmcs01_cpu_dirty_logging = true;
+ return;
+ }
-static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- gfn_t offset, unsigned long mask)
-{
- kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
+ /*
+ * Note, cpu_dirty_logging_count can be changed concurrent with this
+ * code, but in that case another update request will be made and so
+ * the guest will never run with a stale PML value.
+ */
+ if (vcpu->kvm->arch.cpu_dirty_logging_count)
+ secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
+ else
+ secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
static int vmx_pre_block(struct kvm_vcpu *vcpu)
@@ -7546,7 +7587,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
return 0;
}
-static void enable_smi_window(struct kvm_vcpu *vcpu)
+static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
@@ -7582,11 +7623,6 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
return supported & BIT(bit);
}
-static int vmx_cpu_dirty_log_size(void)
-{
- return enable_pml ? PML_ENTITY_NUM : 0;
-}
-
static struct kvm_x86_ops vmx_x86_ops __initdata = {
.hardware_unsetup = hardware_unsetup,
@@ -7606,7 +7642,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
- .update_exception_bitmap = update_exception_bitmap,
+ .update_exception_bitmap = vmx_update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
@@ -7649,9 +7685,9 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.nmi_allowed = vmx_nmi_allowed,
.get_nmi_mask = vmx_get_nmi_mask,
.set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
+ .enable_nmi_window = vmx_enable_nmi_window,
+ .enable_irq_window = vmx_enable_irq_window,
+ .update_cr8_intercept = vmx_update_cr8_intercept,
.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
@@ -7686,10 +7722,8 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.sched_in = vmx_sched_in,
- .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
- .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
- .flush_log_dirty = vmx_flush_log_dirty,
- .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+ .cpu_dirty_log_size = PML_ENTITY_NUM,
+ .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
.pre_block = vmx_pre_block,
.post_block = vmx_post_block,
@@ -7709,7 +7743,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.smi_allowed = vmx_smi_allowed,
.pre_enter_smm = vmx_pre_enter_smm,
.pre_leave_smm = vmx_pre_leave_smm,
- .enable_smi_window = enable_smi_window,
+ .enable_smi_window = vmx_enable_smi_window,
.can_emulate_instruction = vmx_can_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
@@ -7717,7 +7751,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.msr_filter_changed = vmx_msr_filter_changed,
.complete_emulated_msr = kvm_complete_insn_gp,
- .cpu_dirty_log_size = vmx_cpu_dirty_log_size,
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
@@ -7810,6 +7843,8 @@ static __init int hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 48;
}
+ kvm_has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
+
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
if (enable_ept)
@@ -7832,13 +7867,8 @@ static __init int hardware_setup(void)
if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
enable_pml = 0;
- if (!enable_pml) {
- vmx_x86_ops.slot_enable_log_dirty = NULL;
- vmx_x86_ops.slot_disable_log_dirty = NULL;
- vmx_x86_ops.flush_log_dirty = NULL;
- vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
- vmx_x86_ops.cpu_dirty_log_size = NULL;
- }
+ if (!enable_pml)
+ vmx_x86_ops.cpu_dirty_log_size = 0;
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 9d3a557949ac..89da5e1251f1 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -70,6 +70,54 @@ struct pt_desc {
struct pt_ctx guest;
};
+union vmx_exit_reason {
+ struct {
+ u32 basic : 16;
+ u32 reserved16 : 1;
+ u32 reserved17 : 1;
+ u32 reserved18 : 1;
+ u32 reserved19 : 1;
+ u32 reserved20 : 1;
+ u32 reserved21 : 1;
+ u32 reserved22 : 1;
+ u32 reserved23 : 1;
+ u32 reserved24 : 1;
+ u32 reserved25 : 1;
+ u32 bus_lock_detected : 1;
+ u32 enclave_mode : 1;
+ u32 smi_pending_mtf : 1;
+ u32 smi_from_vmx_root : 1;
+ u32 reserved30 : 1;
+ u32 failed_vmentry : 1;
+ };
+ u32 full;
+};
+
+#define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc)
+#define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records)
+
+bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu);
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
+
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
+void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
+
+struct lbr_desc {
+ /* Basic info about guest LBR records. */
+ struct x86_pmu_lbr records;
+
+ /*
+ * Emulate LBR feature via passthrough LBR registers when the
+ * per-vcpu guest LBR event is scheduled on the current pcpu.
+ *
+ * The records may be inaccurate if the host reclaims the LBR.
+ */
+ struct perf_event *event;
+
+ /* True if LBRs are marked as not intercepted in the MSR bitmap */
+ bool msr_passthrough;
+};
+
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -117,6 +165,7 @@ struct nested_vmx {
bool change_vmcs01_virtual_apic_mode;
bool reload_vmcs01_apic_access_page;
+ bool update_vmcs01_cpu_dirty_logging;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
@@ -244,7 +293,7 @@ struct vcpu_vmx {
int vpid;
bool emulation_required;
- u32 exit_reason;
+ union vmx_exit_reason exit_reason;
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
@@ -279,6 +328,7 @@ struct vcpu_vmx {
u64 ept_pointer;
struct pt_desc pt_desc;
+ struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
@@ -329,7 +379,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
int root_level);
-void update_exception_bitmap(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
@@ -339,8 +389,12 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
+bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
+void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type, bool value);
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
static inline u8 vmx_get_rvi(void)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b404e4d7dd8..fe806e894212 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -29,6 +29,7 @@
#include "pmu.h"
#include "hyperv.h"
#include "lapic.h"
+#include "xen.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -114,11 +115,21 @@ static int sync_regs(struct kvm_vcpu *vcpu);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
+#define KVM_X86_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \
+ *(((struct kvm_x86_ops *)0)->func));
+#define KVM_X86_OP_NULL KVM_X86_OP
+#include <asm/kvm-x86-ops.h>
+EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
+EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
+EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current);
+
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
-static bool __read_mostly report_ignored_msrs = true;
+bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+EXPORT_SYMBOL_GPL(report_ignored_msrs);
unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
@@ -136,6 +147,8 @@ u64 __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
u64 __read_mostly kvm_default_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
+bool __read_mostly kvm_has_bus_lock_exit;
+EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -234,7 +247,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
VM_STAT("mmu_pte_write", mmu_pte_write),
- VM_STAT("mmu_pte_updated", mmu_pte_updated),
VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
VM_STAT("mmu_flooded", mmu_flooded),
VM_STAT("mmu_recycled", mmu_recycled),
@@ -395,7 +407,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
- u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff |
+ u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
@@ -484,19 +496,24 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
*/
vcpu->arch.dr6 &= ~DR_TRAP_BITS;
/*
- * DR6.RTM is set by all #DB exceptions that don't clear it.
+ * In order to reflect the #DB exception payload in guest
+ * dr6, three components need to be considered: active low
+ * bit, FIXED_1 bits and active high bits (e.g. DR6_BD,
+ * DR6_BS and DR6_BT)
+ * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
+ * In the target guest dr6:
+ * FIXED_1 bits should always be set.
+ * Active low bits should be cleared if 1-setting in payload.
+ * Active high bits should be set if 1-setting in payload.
+ *
+ * Note, the payload is compatible with the pending debug
+ * exceptions/exit qualification under VMX, that active_low bits
+ * are active high in payload.
+ * So they need to be flipped for DR6.
*/
- vcpu->arch.dr6 |= DR6_RTM;
+ vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
vcpu->arch.dr6 |= payload;
- /*
- * Bit 16 should be set in the payload whenever the #DB
- * exception should clear DR6.RTM. This makes the payload
- * compatible with the pending debug exceptions under VMX.
- * Though not currently documented in the SDM, this also
- * makes the payload compatible with the exit qualification
- * for #DB exceptions under VMX.
- */
- vcpu->arch.dr6 ^= payload & DR6_RTM;
+ vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
/*
* The #DB payload is defined as compatible with the 'pending
@@ -692,7 +709,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
- if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
+ if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
return true;
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
@@ -742,8 +759,7 @@ static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
- return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) |
- rsvd_bits(1, 2);
+ return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
}
/*
@@ -852,7 +868,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!is_pae(vcpu))
return 1;
- kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
if (cs_l)
return 1;
}
@@ -865,7 +881,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
return 1;
- kvm_x86_ops.set_cr0(vcpu, cr0);
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
kvm_post_set_cr0(vcpu, old_cr0, cr0);
@@ -970,12 +986,10 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
- __kvm_set_xcr(vcpu, index, xcr)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- return 0;
+ if (static_call(kvm_x86_get_cpl)(vcpu) == 0)
+ return __kvm_set_xcr(vcpu, index, xcr);
+
+ return 1;
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
@@ -987,7 +1001,7 @@ bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
return false;
- return kvm_x86_ops.is_valid_cr4(vcpu, cr4);
+ return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
}
EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
@@ -1031,7 +1045,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
}
- kvm_x86_ops.set_cr4(vcpu, cr4);
+ static_call(kvm_x86_set_cr4)(vcpu, cr4);
kvm_post_set_cr4(vcpu, old_cr4, cr4);
@@ -1059,8 +1073,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
return 0;
}
- if (is_long_mode(vcpu) &&
- (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
+ if (is_long_mode(vcpu) && kvm_vcpu_is_illegal_gpa(vcpu, cr3))
return 1;
else if (is_pae_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
@@ -1114,7 +1127,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu)
dr7 = vcpu->arch.guest_debug_dr7;
else
dr7 = vcpu->arch.dr7;
- kvm_x86_ops.set_dr7(vcpu, dr7);
+ static_call(kvm_x86_set_dr7)(vcpu, dr7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
if (dr7 & DR7_BP_EN_MASK)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
@@ -1130,7 +1143,7 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
return fixed;
}
-static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
size_t size = ARRAY_SIZE(vcpu->arch.db);
@@ -1143,13 +1156,13 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
case 4:
case 6:
if (!kvm_dr6_valid(val))
- return -1; /* #GP */
+ return 1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
break;
case 5:
default: /* 7 */
if (!kvm_dr7_valid(val))
- return -1; /* #GP */
+ return 1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
kvm_update_dr7(vcpu);
break;
@@ -1157,18 +1170,9 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
return 0;
}
-
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
-{
- if (__kvm_set_dr(vcpu, dr, val)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- return 0;
-}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
size_t size = ARRAY_SIZE(vcpu->arch.db);
@@ -1185,7 +1189,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
*val = vcpu->arch.dr7;
break;
}
- return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
@@ -1426,7 +1429,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
rdmsrl_safe(msr->index, &msr->data);
break;
default:
- return kvm_x86_ops.get_msr_feature(msr);
+ return static_call(kvm_x86_get_msr_feature)(msr);
}
return 0;
}
@@ -1502,7 +1505,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- r = kvm_x86_ops.set_efer(vcpu, efer);
+ r = static_call(kvm_x86_set_efer)(vcpu, efer);
if (r) {
WARN_ON(r > 0);
return r;
@@ -1523,35 +1526,44 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
{
+ struct kvm_x86_msr_filter *msr_filter;
+ struct msr_bitmap_range *ranges;
struct kvm *kvm = vcpu->kvm;
- struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
- u32 count = kvm->arch.msr_filter.count;
- u32 i;
- bool r = kvm->arch.msr_filter.default_allow;
+ bool allowed;
int idx;
+ u32 i;
- /* MSR filtering not set up or x2APIC enabled, allow everything */
- if (!count || (index >= 0x800 && index <= 0x8ff))
+ /* x2APIC MSRs do not support filtering. */
+ if (index >= 0x800 && index <= 0x8ff)
return true;
- /* Prevent collision with set_msr_filter */
idx = srcu_read_lock(&kvm->srcu);
- for (i = 0; i < count; i++) {
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
+ if (!msr_filter) {
+ allowed = true;
+ goto out;
+ }
+
+ allowed = msr_filter->default_allow;
+ ranges = msr_filter->ranges;
+
+ for (i = 0; i < msr_filter->count; i++) {
u32 start = ranges[i].base;
u32 end = start + ranges[i].nmsrs;
u32 flags = ranges[i].flags;
unsigned long *bitmap = ranges[i].bitmap;
if ((index >= start) && (index < end) && (flags & type)) {
- r = !!test_bit(index - start, bitmap);
+ allowed = !!test_bit(index - start, bitmap);
break;
}
}
+out:
srcu_read_unlock(&kvm->srcu, idx);
- return r;
+ return allowed;
}
EXPORT_SYMBOL_GPL(kvm_msr_allowed);
@@ -1599,7 +1611,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
msr.index = index;
msr.host_initiated = host_initiated;
- return kvm_x86_ops.set_msr(vcpu, &msr);
+ return static_call(kvm_x86_set_msr)(vcpu, &msr);
}
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
@@ -1632,7 +1644,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
msr.index = index;
msr.host_initiated = host_initiated;
- ret = kvm_x86_ops.get_msr(vcpu, &msr);
+ ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
if (!ret)
*data = msr.data;
return ret;
@@ -1673,12 +1685,12 @@ static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
}
- return kvm_x86_ops.complete_emulated_msr(vcpu, err);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, err);
}
static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops.complete_emulated_msr(vcpu, vcpu->run->msr.error);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
}
static u64 kvm_msr_reason(int r)
@@ -1750,7 +1762,7 @@ int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
trace_kvm_msr_read_ex(ecx);
}
- return kvm_x86_ops.complete_emulated_msr(vcpu, r);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
@@ -1776,16 +1788,16 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
else
trace_kvm_msr_write_ex(ecx, data);
- return kvm_x86_ops.complete_emulated_msr(vcpu, r);
+ return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
-bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
+ xfer_to_guest_mode_prepare();
return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
xfer_to_guest_mode_work_pending();
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
/*
* The fast path for frequent and performance sensitive wrmsr emulation,
@@ -1935,15 +1947,14 @@ static s64 get_kvmclock_base_ns(void)
}
#endif
-static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
{
int version;
int r;
struct pvclock_wall_clock wc;
+ u32 wc_sec_hi;
u64 wall_nsec;
- kvm->arch.wall_clock = wall_clock;
-
if (!wall_clock)
return;
@@ -1972,6 +1983,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+ if (sec_hi_ofs) {
+ wc_sec_hi = wall_nsec >> 32;
+ kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
+ &wc_sec_hi, sizeof(wc_sec_hi));
+ }
+
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
@@ -2208,7 +2225,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
vcpu->arch.l1_tsc_offset = offset;
- vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
+ vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -2543,6 +2560,8 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
struct kvm_vcpu *vcpu;
struct kvm_arch *ka = &kvm->arch;
+ kvm_hv_invalidate_tsc_page(kvm);
+
spin_lock(&ka->pvclock_gtod_sync_lock);
kvm_make_mclock_inprogress_request(kvm);
/* no guest entries from this point */
@@ -2591,13 +2610,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
return ret;
}
-static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
+ struct gfn_to_hva_cache *cache,
+ unsigned int offset)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info guest_hv_clock;
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
- &guest_hv_clock, sizeof(guest_hv_clock))))
+ if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
+ &guest_hv_clock, offset, sizeof(guest_hv_clock))))
return;
/* This VCPU is paused, but it's legal for a guest to read another
@@ -2620,9 +2641,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
++guest_hv_clock.version; /* first time write, random junk */
vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_write_guest_offset_cached(v->kvm, cache,
+ &vcpu->hv_clock, offset,
+ sizeof(vcpu->hv_clock.version));
smp_wmb();
@@ -2636,16 +2657,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ kvm_write_guest_offset_cached(v->kvm, cache,
+ &vcpu->hv_clock, offset,
+ sizeof(vcpu->hv_clock));
smp_wmb();
vcpu->hv_clock.version++;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_write_guest_offset_cached(v->kvm, cache,
+ &vcpu->hv_clock, offset,
+ sizeof(vcpu->hv_clock.version));
}
static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -2732,7 +2753,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
if (vcpu->pv_time_enabled)
- kvm_setup_pvclock_page(v);
+ kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
+ if (vcpu->xen.vcpu_info_set)
+ kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
+ if (vcpu->xen.vcpu_time_info_set)
+ kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
if (v == kvm_get_vcpu(v->kvm, 0))
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
@@ -2857,32 +2883,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
-static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
-{
- struct kvm *kvm = vcpu->kvm;
- int lm = is_long_mode(vcpu);
- u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
- : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
- u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
- : kvm->arch.xen_hvm_config.blob_size_32;
- u32 page_num = data & ~PAGE_MASK;
- u64 page_addr = data & PAGE_MASK;
- u8 *page;
-
- if (page_num >= blob_size)
- return 1;
-
- page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
- if (IS_ERR(page))
- return PTR_ERR(page);
-
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
- kfree(page);
- return 1;
- }
- return 0;
-}
-
static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
{
u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
@@ -2954,13 +2954,13 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops.tlb_flush_all(vcpu);
+ static_call(kvm_x86_tlb_flush_all)(vcpu);
}
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops.tlb_flush_guest(vcpu);
+ static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
static void record_steal_time(struct kvm_vcpu *vcpu)
@@ -2968,6 +2968,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
struct kvm_host_map map;
struct kvm_steal_time *st;
+ if (kvm_xen_msr_enabled(vcpu->kvm)) {
+ kvm_xen_runstate_set_running(vcpu);
+ return;
+ }
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -3016,6 +3021,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
u32 msr = msr_info->index;
u64 data = msr_info->data;
+ if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
+ return kvm_xen_write_hypercall_page(vcpu, data);
+
switch (msr) {
case MSR_AMD64_NB_CFG:
case MSR_IA32_UCODE_WRITE:
@@ -3072,18 +3080,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
break;
- case MSR_IA32_DEBUGCTLMSR:
- if (!data) {
- /* We support the non-activated case already */
- break;
- } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
- /* Values other than LBR and BTF are vendor-specific,
- thus reserved and should throw a #GP */
- return 1;
- } else if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
- break;
case 0x200 ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
@@ -3152,13 +3148,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
return 1;
- kvm_write_wall_clock(vcpu->kvm, data);
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
break;
case MSR_KVM_WALL_CLOCK:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
return 1;
- kvm_write_wall_clock(vcpu->kvm, data);
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data, 0);
break;
case MSR_KVM_SYSTEM_TIME_NEW:
if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
@@ -3303,8 +3301,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.msr_misc_features_enables = data;
break;
default:
- if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
- return xen_hvm_config(vcpu, data);
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
return KVM_MSR_RET_INVALID;
@@ -3356,7 +3352,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_IA32_PLATFORM_ID:
case MSR_IA32_EBL_CR_POWERON:
- case MSR_IA32_DEBUGCTLMSR:
case MSR_IA32_LASTBRANCHFROMIP:
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
@@ -3738,7 +3733,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_PIT2:
case KVM_CAP_PIT_STATE2:
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
- case KVM_CAP_XEN_HVM:
case KVM_CAP_VCPU_EVENTS:
case KVM_CAP_HYPERV:
case KVM_CAP_HYPERV_VAPIC:
@@ -3778,6 +3772,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
r = 1;
break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_CAP_XEN_HVM:
+ r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO;
+ if (sched_info_on())
+ r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
+ break;
+#endif
case KVM_CAP_SYNC_REGS:
r = KVM_SYNC_X86_VALID_FIELDS;
break;
@@ -3799,10 +3802,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = kvm_x86_ops.has_emulated_msr(kvm, MSR_IA32_SMBASE);
+ r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
- r = !kvm_x86_ops.cpu_has_accelerated_tpr();
+ r = !static_call(kvm_x86_cpu_has_accelerated_tpr)();
break;
case KVM_CAP_NR_VCPUS:
r = KVM_SOFT_MAX_VCPUS;
@@ -3844,6 +3847,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_STEAL_TIME:
r = sched_info_on();
break;
+ case KVM_CAP_X86_BUS_LOCK_EXIT:
+ if (kvm_has_bus_lock_exit)
+ r = KVM_BUS_LOCK_DETECTION_OFF |
+ KVM_BUS_LOCK_DETECTION_EXIT;
+ else
+ r = 0;
+ break;
default:
break;
}
@@ -3961,14 +3971,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
- if (kvm_x86_ops.has_wbinvd_exit())
+ if (static_call(kvm_x86_has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
smp_call_function_single(vcpu->cpu,
wbinvd_ipi, NULL, 1);
}
- kvm_x86_ops.vcpu_load(vcpu, cpu);
+ static_call(kvm_x86_vcpu_load)(vcpu, cpu);
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
@@ -4014,6 +4024,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
struct kvm_host_map map;
struct kvm_steal_time *st;
+ int idx;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4021,9 +4032,15 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (vcpu->arch.st.preempted)
return;
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
&vcpu->arch.st.cache, true))
- return;
+ goto out;
st = map.hva +
offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
@@ -4031,33 +4048,22 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+
+out:
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- int idx;
-
if (vcpu->preempted && !vcpu->arch.guest_state_protected)
- vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
+ vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
- /*
- * Disable page faults because we're in atomic context here.
- * kvm_write_guest_offset_cached() would call might_fault()
- * that relies on pagefault_disable() to tell if there's a
- * bug. NOTE: the write to guest memory may not go through if
- * during postcopy live migration or if there's heavy guest
- * paging.
- */
- pagefault_disable();
- /*
- * kvm_memslots() will be called by
- * kvm_write_guest_offset_cached() so take the srcu lock.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_steal_time_set_preempted(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- pagefault_enable();
- kvm_x86_ops.vcpu_put(vcpu);
+ if (kvm_xen_msr_enabled(vcpu->kvm))
+ kvm_xen_runstate_set_preempted(vcpu);
+ else
+ kvm_steal_time_set_preempted(vcpu);
+
+ static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
/*
* If userspace has set any breakpoints or watchpoints, dr6 is restored
@@ -4071,7 +4077,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
if (vcpu->arch.apicv_active)
- kvm_x86_ops.sync_pir_to_irr(vcpu);
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -4181,7 +4187,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
- kvm_x86_ops.setup_mce(vcpu);
+ static_call(kvm_x86_setup_mce)(vcpu);
out:
return r;
}
@@ -4288,11 +4294,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
- events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
+ events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
- events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
+ events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
events->nmi.pad = 0;
events->sipi_vector = 0; /* never valid when reporting to user space */
@@ -4359,13 +4365,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- kvm_x86_ops.set_interrupt_shadow(vcpu,
- events->interrupt.shadow);
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu,
+ events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
vcpu->arch.nmi_pending = events->nmi.pending;
- kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
+ static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
lapic_in_kernel(vcpu))
@@ -4421,9 +4427,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (dbgregs->flags)
return -EINVAL;
- if (dbgregs->dr6 & ~0xffffffffull)
+ if (!kvm_dr6_valid(dbgregs->dr6))
return -EINVAL;
- if (dbgregs->dr7 & ~0xffffffffull)
+ if (!kvm_dr7_valid(dbgregs->dr7))
return -EINVAL;
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
@@ -4660,7 +4666,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
if (!kvm_x86_ops.enable_direct_tlbflush)
return -ENOTTY;
- return kvm_x86_ops.enable_direct_tlbflush(vcpu);
+ return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
@@ -5031,6 +5037,28 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_SUPPORTED_HV_CPUID:
r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
break;
+#ifdef CONFIG_KVM_XEN
+ case KVM_XEN_VCPU_GET_ATTR: {
+ struct kvm_xen_vcpu_attr xva;
+
+ r = -EFAULT;
+ if (copy_from_user(&xva, argp, sizeof(xva)))
+ goto out;
+ r = kvm_xen_vcpu_get_attr(vcpu, &xva);
+ if (!r && copy_to_user(argp, &xva, sizeof(xva)))
+ r = -EFAULT;
+ break;
+ }
+ case KVM_XEN_VCPU_SET_ATTR: {
+ struct kvm_xen_vcpu_attr xva;
+
+ r = -EFAULT;
+ if (copy_from_user(&xva, argp, sizeof(xva)))
+ goto out;
+ r = kvm_xen_vcpu_set_attr(vcpu, &xva);
+ break;
+ }
+#endif
default:
r = -EINVAL;
}
@@ -5052,14 +5080,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
if (addr > (unsigned int)(-3 * PAGE_SIZE))
return -EINVAL;
- ret = kvm_x86_ops.set_tss_addr(kvm, addr);
+ ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
return ret;
}
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
- return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
+ return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -5213,11 +5241,18 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
+
/*
- * Flush potentially hardware-cached dirty pages to dirty_bitmap.
+ * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called
+ * before reporting dirty_bitmap to userspace. KVM flushes the buffers
+ * on all VM-Exits, thus we only need to kick running vCPUs to force a
+ * VM-Exit.
*/
- if (kvm_x86_ops.flush_log_dirty)
- kvm_x86_ops.flush_log_dirty(kvm);
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
}
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@ -5307,6 +5342,20 @@ split_irqchip_unlock:
kvm->arch.user_space_msr_mask = cap->args[0];
r = 0;
break;
+ case KVM_CAP_X86_BUS_LOCK_EXIT:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
+ break;
+
+ if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
+ (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
+ break;
+
+ if (kvm_has_bus_lock_exit &&
+ cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
+ kvm->arch.bus_lock_detection_enabled = true;
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -5314,25 +5363,34 @@ split_irqchip_unlock:
return r;
}
-static void kvm_clear_msr_filter(struct kvm *kvm)
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
+ if (!msr_filter)
+ return NULL;
+
+ msr_filter->default_allow = default_allow;
+ return msr_filter;
+}
+
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
{
u32 i;
- u32 count = kvm->arch.msr_filter.count;
- struct msr_bitmap_range ranges[16];
- mutex_lock(&kvm->lock);
- kvm->arch.msr_filter.count = 0;
- memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0]));
- mutex_unlock(&kvm->lock);
- synchronize_srcu(&kvm->srcu);
+ if (!msr_filter)
+ return;
+
+ for (i = 0; i < msr_filter->count; i++)
+ kfree(msr_filter->ranges[i].bitmap);
- for (i = 0; i < count; i++)
- kfree(ranges[i].bitmap);
+ kfree(msr_filter);
}
-static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range)
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
+ struct kvm_msr_filter_range *user_range)
{
- struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges;
struct msr_bitmap_range range;
unsigned long *bitmap = NULL;
size_t bitmap_size;
@@ -5366,11 +5424,9 @@ static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user
goto err;
}
- /* Everything ok, add this range identifier to our global pool */
- ranges[kvm->arch.msr_filter.count] = range;
- /* Make sure we filled the array before we tell anyone to walk it */
- smp_wmb();
- kvm->arch.msr_filter.count++;
+ /* Everything ok, add this range identifier. */
+ msr_filter->ranges[msr_filter->count] = range;
+ msr_filter->count++;
return 0;
err:
@@ -5381,10 +5437,11 @@ err:
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
{
struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
struct kvm_msr_filter filter;
bool default_allow;
- int r = 0;
bool empty = true;
+ int r = 0;
u32 i;
if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
@@ -5397,25 +5454,32 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
if (empty && !default_allow)
return -EINVAL;
- kvm_clear_msr_filter(kvm);
+ new_filter = kvm_alloc_msr_filter(default_allow);
+ if (!new_filter)
+ return -ENOMEM;
- kvm->arch.msr_filter.default_allow = default_allow;
-
- /*
- * Protect from concurrent calls to this function that could trigger
- * a TOCTOU violation on kvm->arch.msr_filter.count.
- */
- mutex_lock(&kvm->lock);
for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
- r = kvm_add_msr_filter(kvm, &filter.ranges[i]);
- if (r)
- break;
+ r = kvm_add_msr_filter(new_filter, &filter.ranges[i]);
+ if (r) {
+ kvm_free_msr_filter(new_filter);
+ return r;
+ }
}
+ mutex_lock(&kvm->lock);
+
+ /* The per-VM filter is protected by kvm->lock... */
+ old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
+
+ rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
+ synchronize_srcu(&kvm->srcu);
+
+ kvm_free_msr_filter(old_filter);
+
kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
mutex_unlock(&kvm->lock);
- return r;
+ return 0;
}
long kvm_arch_vm_ioctl(struct file *filp,
@@ -5631,18 +5695,36 @@ set_pit2_out:
kvm->arch.bsp_vcpu_id = arg;
mutex_unlock(&kvm->lock);
break;
+#ifdef CONFIG_KVM_XEN
case KVM_XEN_HVM_CONFIG: {
struct kvm_xen_hvm_config xhc;
r = -EFAULT;
if (copy_from_user(&xhc, argp, sizeof(xhc)))
goto out;
- r = -EINVAL;
- if (xhc.flags)
+ r = kvm_xen_hvm_config(kvm, &xhc);
+ break;
+ }
+ case KVM_XEN_HVM_GET_ATTR: {
+ struct kvm_xen_hvm_attr xha;
+
+ r = -EFAULT;
+ if (copy_from_user(&xha, argp, sizeof(xha)))
goto out;
- memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
- r = 0;
+ r = kvm_xen_hvm_get_attr(kvm, &xha);
+ if (!r && copy_to_user(argp, &xha, sizeof(xha)))
+ r = -EFAULT;
+ break;
+ }
+ case KVM_XEN_HVM_SET_ATTR: {
+ struct kvm_xen_hvm_attr xha;
+
+ r = -EFAULT;
+ if (copy_from_user(&xha, argp, sizeof(xha)))
+ goto out;
+ r = kvm_xen_hvm_set_attr(kvm, &xha);
break;
}
+#endif
case KVM_SET_CLOCK: {
struct kvm_clock_data user_ns;
u64 now_ns;
@@ -5685,7 +5767,7 @@ set_pit2_out:
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_op)
- r = kvm_x86_ops.mem_enc_op(kvm, argp);
+ r = static_call(kvm_x86_mem_enc_op)(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -5697,7 +5779,7 @@ set_pit2_out:
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_reg_region)
- r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
+ r = static_call(kvm_x86_mem_enc_reg_region)(kvm, &region);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -5709,7 +5791,7 @@ set_pit2_out:
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_unreg_region)
- r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
+ r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, &region);
break;
}
case KVM_HYPERV_EVENTFD: {
@@ -5811,7 +5893,7 @@ static void kvm_init_msr_list(void)
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
- if (!kvm_x86_ops.has_emulated_msr(NULL, emulated_msrs_all[i]))
+ if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
continue;
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
@@ -5874,13 +5956,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
static void kvm_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- kvm_x86_ops.set_segment(vcpu, var, seg);
+ static_call(kvm_x86_set_segment)(vcpu, var, seg);
}
void kvm_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- kvm_x86_ops.get_segment(vcpu, var, seg);
+ static_call(kvm_x86_get_segment)(vcpu, var, seg);
}
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -5900,14 +5982,14 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
@@ -5915,7 +5997,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
@@ -5964,7 +6046,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
@@ -5989,7 +6071,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -6010,7 +6092,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = 0;
- if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
+ if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -6063,7 +6145,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = PFERR_WRITE_MASK;
- if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
+ if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -6088,7 +6170,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
- if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
+ if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0)))
return 1;
if (force_emulation_prefix &&
@@ -6122,7 +6204,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
- u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
/*
@@ -6530,7 +6612,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- return kvm_x86_ops.get_segment_base(vcpu, seg);
+ return static_call(kvm_x86_get_segment_base)(vcpu, seg);
}
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
@@ -6543,11 +6625,11 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
- if (kvm_x86_ops.has_wbinvd_exit()) {
+ if (static_call(kvm_x86_has_wbinvd_exit)()) {
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
wbinvd_ipi, NULL, 1);
put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
@@ -6570,17 +6652,17 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
}
-static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest)
+static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+ unsigned long *dest)
{
- return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+ kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
}
static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
unsigned long value)
{
- return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
+ return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
}
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -6648,27 +6730,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
- return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
+ return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
}
static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
+ static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
}
static unsigned long emulator_get_cached_segment_base(
@@ -6810,7 +6892,7 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
- return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
+ return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
&ctxt->exception);
}
@@ -6848,7 +6930,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
{
- kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
+ static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
}
static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
@@ -6864,7 +6946,7 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla
static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
- return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
+ return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
}
static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
@@ -6926,7 +7008,7 @@ static const struct x86_emulate_ops emulate_ops = {
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
+ u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -6937,7 +7019,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
if (int_shadow & mask)
mask = 0;
if (unlikely(int_shadow || mask)) {
- kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
if (!mask)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -6979,7 +7061,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
- kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
ctxt->gpa_available = false;
ctxt->eflags = kvm_get_rflags(vcpu);
@@ -7040,7 +7122,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
kvm_queue_exception(vcpu, UD_VECTOR);
- if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
+ if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -7100,9 +7182,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (vcpu->arch.mmu->direct_map) {
unsigned int indirect_shadow_pages;
- spin_lock(&vcpu->kvm->mmu_lock);
+ write_lock(&vcpu->kvm->mmu_lock);
indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
- spin_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
if (indirect_shadow_pages)
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -7209,7 +7291,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -7221,10 +7303,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
int r;
- r = kvm_x86_ops.skip_emulated_instruction(vcpu);
+ r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
if (unlikely(!r))
return 0;
@@ -7253,7 +7335,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
vcpu->arch.eff_db);
if (dr6 != 0) {
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
kvm_run->debug.arch.pc = eip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -7310,6 +7392,42 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
+/*
+ * Decode to be emulated instruction. Return EMULATION_OK if success.
+ */
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len)
+{
+ int r = EMULATION_OK;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ init_emulate_ctxt(vcpu);
+
+ /*
+ * We will reenter on the same instruction since we do not set
+ * complete_userspace_io. This does not handle watchpoints yet,
+ * those would be handled in the emulate_ops.
+ */
+ if (!(emulation_type & EMULTYPE_SKIP) &&
+ kvm_vcpu_check_breakpoint(vcpu, &r))
+ return r;
+
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
+
+ r = x86_decode_insn(ctxt, insn, insn_len);
+
+ trace_kvm_emulate_insn_start(vcpu);
+ ++vcpu->stat.insn_emulation;
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len)
{
@@ -7318,7 +7436,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool writeback = true;
bool write_fault_to_spt;
- if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
+ if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len)))
return 1;
vcpu->arch.l1tf_flush_l1d = true;
@@ -7329,32 +7447,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
*/
write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
vcpu->arch.write_fault_to_shadow_pgtable = false;
- kvm_clear_exception_queue(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
- init_emulate_ctxt(vcpu);
-
- /*
- * We will reenter on the same instruction since
- * we do not set complete_userspace_io. This does not
- * handle watchpoints yet, those would be handled in
- * the emulate_ops.
- */
- if (!(emulation_type & EMULTYPE_SKIP) &&
- kvm_vcpu_check_breakpoint(vcpu, &r))
- return r;
-
- ctxt->interruptibility = 0;
- ctxt->have_exception = false;
- ctxt->exception.vector = -1;
- ctxt->perm_ok = false;
-
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
+ kvm_clear_exception_queue(vcpu);
- r = x86_decode_insn(ctxt, insn, insn_len);
-
- trace_kvm_emulate_insn_start(vcpu);
- ++vcpu->stat.insn_emulation;
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
+ insn, insn_len);
if (r != EMULATION_OK) {
if ((emulation_type & EMULTYPE_TRAP_UD) ||
(emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
@@ -7461,7 +7559,7 @@ restart:
r = 1;
if (writeback) {
- unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+ unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
if (!ctxt->have_exception ||
@@ -7470,7 +7568,7 @@ restart:
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
if (kvm_x86_ops.update_emulated_instruction)
- kvm_x86_ops.update_emulated_instruction(vcpu);
+ static_call(kvm_x86_update_emulated_instruction)(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -7799,7 +7897,7 @@ static int kvm_is_user_mode(void)
int user_mode = 3;
if (__this_cpu_read(current_vcpu))
- user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
+ user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu));
return user_mode != 0;
}
@@ -7944,7 +8042,6 @@ int kvm_arch_init(void *opaque)
supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
}
- kvm_lapic_init();
if (pi_inject_timer == -1)
pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
#ifdef CONFIG_X86_64
@@ -7986,6 +8083,10 @@ void kvm_arch_exit(void)
kvm_mmu_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_fpu_cache);
+#ifdef CONFIG_KVM_XEN
+ static_key_deferred_flush(&kvm_xen_enabled);
+ WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
+#endif
}
static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason)
@@ -8037,7 +8138,7 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
return -KVM_EOPNOTSUPP;
- if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
+ if (!kvm_get_walltime_and_clockread(&ts, &cycle))
return -KVM_EOPNOTSUPP;
clock_pairing.sec = ts.tv_sec;
@@ -8113,7 +8214,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
unsigned long nr, a0, a1, a2, a3, ret;
int op_64_bit;
- if (kvm_hv_hypercall_enabled(vcpu->kvm))
+ if (kvm_xen_hypercall_enabled(vcpu->kvm))
+ return kvm_xen_hypercall(vcpu);
+
+ if (kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
nr = kvm_rax_read(vcpu);
@@ -8133,7 +8237,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0xFFFFFFFF;
}
- if (kvm_x86_ops.get_cpl(vcpu) != 0) {
+ if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
ret = -KVM_EPERM;
goto out;
}
@@ -8190,7 +8294,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
- kvm_x86_ops.patch_hypercall(vcpu, instruction);
+ static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
&ctxt->exception);
@@ -8214,12 +8318,14 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->if_flag = !vcpu->arch.guest_state_protected
&& (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
- kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
kvm_run->ready_for_interrupt_injection =
pic_in_kernel(vcpu->kvm) ||
kvm_vcpu_ready_for_interrupt_injection(vcpu);
+
+ if (is_smm(vcpu))
+ kvm_run->flags |= KVM_RUN_X86_SMM;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -8245,7 +8351,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
tpr = kvm_lapic_get_cr8(vcpu);
- kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
+ static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
}
static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
@@ -8256,7 +8362,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
/* try to reinject previous events if any */
if (vcpu->arch.exception.injected) {
- kvm_x86_ops.queue_exception(vcpu);
+ static_call(kvm_x86_queue_exception)(vcpu);
can_inject = false;
}
/*
@@ -8275,10 +8381,10 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
*/
else if (!vcpu->arch.exception.pending) {
if (vcpu->arch.nmi_injected) {
- kvm_x86_ops.set_nmi(vcpu);
+ static_call(kvm_x86_set_nmi)(vcpu);
can_inject = false;
} else if (vcpu->arch.interrupt.injected) {
- kvm_x86_ops.set_irq(vcpu);
+ static_call(kvm_x86_set_irq)(vcpu);
can_inject = false;
}
}
@@ -8319,7 +8425,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
}
}
- kvm_x86_ops.queue_exception(vcpu);
+ static_call(kvm_x86_queue_exception)(vcpu);
can_inject = false;
}
@@ -8335,7 +8441,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
* The kvm_x86_ops hooks communicate this by returning -EBUSY.
*/
if (vcpu->arch.smi_pending) {
- r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
+ r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
goto busy;
if (r) {
@@ -8344,35 +8450,35 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
enter_smm(vcpu);
can_inject = false;
} else
- kvm_x86_ops.enable_smi_window(vcpu);
+ static_call(kvm_x86_enable_smi_window)(vcpu);
}
if (vcpu->arch.nmi_pending) {
- r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
+ r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
goto busy;
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
- kvm_x86_ops.set_nmi(vcpu);
+ static_call(kvm_x86_set_nmi)(vcpu);
can_inject = false;
- WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
+ WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
}
if (vcpu->arch.nmi_pending)
- kvm_x86_ops.enable_nmi_window(vcpu);
+ static_call(kvm_x86_enable_nmi_window)(vcpu);
}
if (kvm_cpu_has_injectable_intr(vcpu)) {
- r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
+ r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
goto busy;
if (r) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
- kvm_x86_ops.set_irq(vcpu);
- WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
+ static_call(kvm_x86_set_irq)(vcpu);
+ WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
}
if (kvm_cpu_has_injectable_intr(vcpu))
- kvm_x86_ops.enable_irq_window(vcpu);
+ static_call(kvm_x86_enable_irq_window)(vcpu);
}
if (is_guest_mode(vcpu) &&
@@ -8397,7 +8503,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
* If an NMI is already in progress, limit further NMIs to just one.
* Otherwise, allow two (and we'll inject the first one immediately).
*/
- if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
@@ -8487,11 +8593,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7f7c, seg.limit);
put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
- kvm_x86_ops.get_gdt(vcpu, &dt);
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
put_smstate(u32, buf, 0x7f70, dt.size);
- kvm_x86_ops.get_idt(vcpu, &dt);
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
put_smstate(u32, buf, 0x7f58, dt.address);
put_smstate(u32, buf, 0x7f54, dt.size);
@@ -8541,7 +8647,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
- kvm_x86_ops.get_idt(vcpu, &dt);
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
put_smstate(u32, buf, 0x7e84, dt.size);
put_smstate(u64, buf, 0x7e88, dt.address);
@@ -8551,7 +8657,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
- kvm_x86_ops.get_gdt(vcpu, &dt);
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
put_smstate(u32, buf, 0x7e64, dt.size);
put_smstate(u64, buf, 0x7e68, dt.address);
@@ -8581,30 +8687,30 @@ static void enter_smm(struct kvm_vcpu *vcpu)
* vCPU state (e.g. leave guest mode) after we've saved the state into
* the SMM state-save area.
*/
- kvm_x86_ops.pre_enter_smm(vcpu, buf);
+ static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
- if (kvm_x86_ops.get_nmi_mask(vcpu))
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu))
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- kvm_x86_ops.set_nmi_mask(vcpu, true);
+ static_call(kvm_x86_set_nmi_mask)(vcpu, true);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0x8000);
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- kvm_x86_ops.set_cr0(vcpu, cr0);
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
vcpu->arch.cr0 = cr0;
- kvm_x86_ops.set_cr4(vcpu, 0);
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
/* Undocumented: IDT limit is set to zero on entry to SMM. */
dt.address = dt.size = 0;
- kvm_x86_ops.set_idt(vcpu, &dt);
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
- __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+ kvm_set_dr(vcpu, 7, DR7_FIXED_1);
cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
cs.base = vcpu->arch.smbase;
@@ -8633,7 +8739,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- kvm_x86_ops.set_efer(vcpu, 0);
+ static_call(kvm_x86_set_efer)(vcpu, 0);
#endif
kvm_update_cpuid_runtime(vcpu);
@@ -8671,7 +8777,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
kvm_apic_update_apicv(vcpu);
- kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
+ static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
@@ -8688,7 +8794,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
unsigned long old, new, expected;
if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
- !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
+ !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
return;
old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
@@ -8708,7 +8814,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
trace_kvm_apicv_update_request(activate, bit);
if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
- kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
+ static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate);
/*
* Sending request to update APICV for all other vcpus,
@@ -8734,7 +8840,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
if (vcpu->arch.apicv_active)
- kvm_x86_ops.sync_pir_to_irr(vcpu);
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -8752,9 +8858,12 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
- vcpu_to_synic(vcpu)->vec_bitmap, 256);
- kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+ if (to_hv_vcpu(vcpu))
+ bitmap_or((ulong *)eoi_exit_bitmap,
+ vcpu->arch.ioapic_handled_vectors,
+ to_hv_synic(vcpu)->vec_bitmap, 256);
+
+ static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
}
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -8779,7 +8888,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
if (!kvm_x86_ops.set_apic_access_page_addr)
return;
- kvm_x86_ops.set_apic_access_page_addr(vcpu);
+ static_call(kvm_x86_set_apic_access_page_addr)(vcpu);
}
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -8904,8 +9013,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv = vcpu->arch.hyperv.exit;
+ vcpu->run->hyperv = hv_vcpu->exit;
r = 0;
goto out;
}
@@ -8922,10 +9033,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- kvm_x86_ops.msr_filter_changed(vcpu);
+ static_call(kvm_x86_msr_filter_changed)(vcpu);
+
+ if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
+ static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
}
- if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+ if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
+ kvm_xen_has_interrupt(vcpu)) {
++vcpu->stat.req_event;
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
@@ -8935,7 +9050,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
inject_pending_event(vcpu, &req_immediate_exit);
if (req_int_win)
- kvm_x86_ops.enable_irq_window(vcpu);
+ static_call(kvm_x86_enable_irq_window)(vcpu);
if (kvm_lapic_enabled(vcpu)) {
update_cr8_intercept(vcpu);
@@ -8950,7 +9065,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
- kvm_x86_ops.prepare_guest_switch(vcpu);
+ static_call(kvm_x86_prepare_guest_switch)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -8981,7 +9096,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* notified with kvm_vcpu_kick.
*/
if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- kvm_x86_ops.sync_pir_to_irr(vcpu);
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -8995,7 +9110,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit) {
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_x86_ops.request_immediate_exit(vcpu);
+ static_call(kvm_x86_request_immediate_exit)(vcpu);
}
fpregs_assert_state_consistent();
@@ -9012,7 +9127,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
- exit_fastpath = kvm_x86_ops.run(vcpu);
+ for (;;) {
+ exit_fastpath = static_call(kvm_x86_run)(vcpu);
+ if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
+ break;
+
+ if (unlikely(kvm_vcpu_exit_request(vcpu))) {
+ exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
+ break;
+ }
+
+ if (vcpu->arch.apicv_active)
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+ }
/*
* Do this here before restoring debug registers on the host. And
@@ -9022,7 +9149,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
- kvm_x86_ops.sync_dirty_debug_regs(vcpu);
+ static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
@@ -9044,7 +9171,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
- kvm_x86_ops.handle_exit_irqoff(vcpu);
+ static_call(kvm_x86_handle_exit_irqoff)(vcpu);
/*
* Consume any pending interrupts, including the possible source of
@@ -9086,13 +9213,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
- r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
+ r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
return r;
cancel_injection:
if (req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_x86_ops.cancel_injection(vcpu);
+ static_call(kvm_x86_cancel_injection)(vcpu);
if (unlikely(vcpu->arch.apic_attention))
kvm_lapic_sync_from_vapic(vcpu);
out:
@@ -9102,13 +9229,13 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
if (!kvm_arch_vcpu_runnable(vcpu) &&
- (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
+ (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
if (kvm_x86_ops.post_block)
- kvm_x86_ops.post_block(vcpu);
+ static_call(kvm_x86_post_block)(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
@@ -9329,6 +9456,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
vcpu_load(vcpu);
kvm_sigset_activate(vcpu);
+ kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
@@ -9503,10 +9631,10 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
- kvm_x86_ops.get_idt(vcpu, &dt);
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
sregs->idt.limit = dt.size;
sregs->idt.base = dt.address;
- kvm_x86_ops.get_gdt(vcpu, &dt);
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
sregs->gdt.limit = dt.size;
sregs->gdt.base = dt.address;
@@ -9624,7 +9752,7 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
*/
if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
return false;
- if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
+ if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3))
return false;
} else {
/*
@@ -9659,10 +9787,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
- kvm_x86_ops.set_idt(vcpu, &dt);
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
dt.size = sregs->gdt.limit;
dt.address = sregs->gdt.base;
- kvm_x86_ops.set_gdt(vcpu, &dt);
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
@@ -9672,14 +9800,14 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
kvm_set_cr8(vcpu, sregs->cr8);
mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
- kvm_x86_ops.set_efer(vcpu, sregs->efer);
+ static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
- kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
+ static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
- kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
+ static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
@@ -9787,7 +9915,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops.update_exception_bitmap(vcpu);
+ static_call(kvm_x86_update_exception_bitmap)(vcpu);
r = 0;
@@ -9965,7 +10093,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (kvm_apicv_activated(vcpu->kvm))
vcpu->arch.apicv_active = true;
} else
- static_key_slow_inc(&kvm_no_apic_vcpu);
+ static_branch_inc(&kvm_has_noapic_vcpu);
r = -ENOMEM;
@@ -10003,7 +10131,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
fx_init(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+ vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
@@ -10013,9 +10141,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.pending_external_vector = -1;
vcpu->arch.preempted_in_kernel = false;
- kvm_hv_vcpu_init(vcpu);
-
- r = kvm_x86_ops.vcpu_create(vcpu);
+ r = static_call(kvm_x86_vcpu_create)(vcpu);
if (r)
goto free_guest_fpu;
@@ -10051,8 +10177,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
- kvm_hv_vcpu_postcreate(vcpu);
-
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
@@ -10078,7 +10202,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvmclock_reset(vcpu);
- kvm_x86_ops.vcpu_free(vcpu);
+ static_call(kvm_x86_vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -10095,7 +10219,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
kvfree(vcpu->arch.cpuid_entries);
if (!lapic_in_kernel(vcpu))
- static_key_slow_dec(&kvm_no_apic_vcpu);
+ static_branch_dec(&kvm_has_noapic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -10114,7 +10238,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
- vcpu->arch.dr6 = DR6_INIT;
+ vcpu->arch.dr6 = DR6_ACTIVE_LOW;
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -10167,7 +10291,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.ia32_xss = 0;
- kvm_x86_ops.vcpu_reset(vcpu, init_event);
+ static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -10193,7 +10317,7 @@ int kvm_arch_hardware_enable(void)
bool stable, backwards_tsc = false;
kvm_user_return_msr_cpu_online();
- ret = kvm_x86_ops.hardware_enable();
+ ret = static_call(kvm_x86_hardware_enable)();
if (ret != 0)
return ret;
@@ -10275,7 +10399,7 @@ int kvm_arch_hardware_enable(void)
void kvm_arch_hardware_disable(void)
{
- kvm_x86_ops.hardware_disable();
+ static_call(kvm_x86_hardware_disable)();
drop_user_return_notifiers();
}
@@ -10294,6 +10418,7 @@ int kvm_arch_hardware_setup(void *opaque)
return r;
memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+ kvm_ops_static_call_update();
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
supported_xss = 0;
@@ -10322,7 +10447,7 @@ int kvm_arch_hardware_setup(void *opaque)
void kvm_arch_hardware_unsetup(void)
{
- kvm_x86_ops.hardware_unsetup();
+ static_call(kvm_x86_hardware_unsetup)();
}
int kvm_arch_check_processor_compat(void *opaque)
@@ -10350,8 +10475,8 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-struct static_key kvm_no_apic_vcpu __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
@@ -10362,12 +10487,12 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
pmu->need_cleanup = true;
kvm_make_request(KVM_REQ_PMU, vcpu);
}
- kvm_x86_ops.sched_in(vcpu, cpu);
+ static_call(kvm_x86_sched_in)(vcpu, cpu);
}
void kvm_arch_free_vm(struct kvm *kvm)
{
- kfree(kvm->arch.hyperv.hv_pa_pg);
+ kfree(to_kvm_hv(kvm)->hv_pa_pg);
vfree(kvm);
}
@@ -10406,7 +10531,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
- return kvm_x86_ops.vm_init(kvm);
+ return static_call(kvm_x86_vm_init)(kvm);
}
int kvm_arch_post_init_vm(struct kvm *kvm)
@@ -10502,7 +10627,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
return (void __user *)hva;
} else {
if (!slot || !slot->npages)
- return 0;
+ return NULL;
old_npages = slot->npages;
hva = slot->userspace_addr;
@@ -10535,8 +10660,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
- u32 i;
-
if (current->mm == kvm->mm) {
/*
* Free memory regions allocated on behalf of userspace,
@@ -10551,10 +10674,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
mutex_unlock(&kvm->slots_lock);
}
- if (kvm_x86_ops.vm_destroy)
- kvm_x86_ops.vm_destroy(kvm);
- for (i = 0; i < kvm->arch.msr_filter.count; i++)
- kfree(kvm->arch.msr_filter.ranges[i].bitmap);
+ static_call_cond(kvm_x86_vm_destroy)(kvm);
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);
@@ -10562,6 +10683,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
kvm_page_track_cleanup(kvm);
+ kvm_xen_destroy_vm(kvm);
kvm_hv_destroy_vm(kvm);
}
@@ -10680,75 +10802,96 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
return 0;
}
+
+static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
+{
+ struct kvm_arch *ka = &kvm->arch;
+
+ if (!kvm_x86_ops.cpu_dirty_log_size)
+ return;
+
+ if ((enable && ++ka->cpu_dirty_logging_count == 1) ||
+ (!enable && --ka->cpu_dirty_logging_count == 0))
+ kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
+
+ WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0);
+}
+
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
struct kvm_memory_slot *old,
struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
+ bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
+
/*
- * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
- * See comments below.
+ * Update CPU dirty logging if dirty logging is being toggled. This
+ * applies to all operations.
*/
- if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
- return;
+ if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)
+ kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
/*
- * Dirty logging tracks sptes in 4k granularity, meaning that large
- * sptes have to be split. If live migration is successful, the guest
- * in the source machine will be destroyed and large sptes will be
- * created in the destination. However, if the guest continues to run
- * in the source machine (for example if live migration fails), small
- * sptes will remain around and cause bad performance.
- *
- * Scan sptes if dirty logging has been stopped, dropping those
- * which can be collapsed into a single large-page spte. Later
- * page faults will create the large-page sptes.
+ * Nothing more to do for RO slots (which can't be dirtied and can't be
+ * made writable) or CREATE/MOVE/DELETE of a slot.
*
- * There is no need to do this in any of the following cases:
+ * For a memslot with dirty logging disabled:
* CREATE: No dirty mappings will already exist.
* MOVE/DELETE: The old mappings will already have been cleaned up by
* kvm_arch_flush_shadow_memslot()
+ *
+ * For a memslot with dirty logging enabled:
+ * CREATE: No shadow pages exist, thus nothing to write-protect
+ * and no dirty bits to clear.
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
+ * kvm_arch_flush_shadow_memslot().
*/
- if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
- !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
- kvm_mmu_zap_collapsible_sptes(kvm, new);
+ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
+ return;
/*
- * Enable or disable dirty logging for the slot.
- *
- * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
- * slot have been zapped so no dirty logging updates are needed for
- * the old slot.
- * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
- * any mappings that might be created in it will consume the
- * properties of the new slot and do not need to be updated here.
- *
- * When PML is enabled, the kvm_x86_ops dirty logging hooks are
- * called to enable/disable dirty logging.
- *
- * When disabling dirty logging with PML enabled, the D-bit is set
- * for sptes in the slot in order to prevent unnecessary GPA
- * logging in the PML buffer (and potential PML buffer full VMEXIT).
- * This guarantees leaving PML enabled for the guest's lifetime
- * won't have any additional overhead from PML when the guest is
- * running with dirty logging disabled.
- *
- * When enabling dirty logging, large sptes are write-protected
- * so they can be split on first write. New large sptes cannot
- * be created for this slot until the end of the logging.
- * See the comments in fast_page_fault().
- * For small sptes, nothing is done if the dirty log is in the
- * initial-all-set state. Otherwise, depending on whether pml
- * is enabled the D-bit or the W-bit will be cleared.
+ * READONLY and non-flags changes were filtered out above, and the only
+ * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
+ * logging isn't being toggled on or off.
*/
- if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
- if (kvm_x86_ops.slot_enable_log_dirty) {
- kvm_x86_ops.slot_enable_log_dirty(kvm, new);
- } else {
- int level =
- kvm_dirty_log_manual_protect_and_init_set(kvm) ?
- PG_LEVEL_2M : PG_LEVEL_4K;
+ if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES)))
+ return;
+
+ if (!log_dirty_pages) {
+ /*
+ * Dirty logging tracks sptes in 4k granularity, meaning that
+ * large sptes have to be split. If live migration succeeds,
+ * the guest in the source machine will be destroyed and large
+ * sptes will be created in the destination. However, if the
+ * guest continues to run in the source machine (for example if
+ * live migration fails), small sptes will remain around and
+ * cause bad performance.
+ *
+ * Scan sptes if dirty logging has been stopped, dropping those
+ * which can be collapsed into a single large-page spte. Later
+ * page faults will create the large-page sptes.
+ */
+ kvm_mmu_zap_collapsible_sptes(kvm, new);
+ } else {
+ /* By default, write-protect everything to log writes. */
+ int level = PG_LEVEL_4K;
+ if (kvm_x86_ops.cpu_dirty_log_size) {
+ /*
+ * Clear all dirty bits, unless pages are treated as
+ * dirty from the get-go.
+ */
+ if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
+ kvm_mmu_slot_leaf_clear_dirty(kvm, new);
+
+ /*
+ * Write-protect large pages on write so that dirty
+ * logging happens at 4k granularity. No need to
+ * write-protect small SPTEs since write accesses are
+ * logged by the CPU via dirty bits.
+ */
+ level = PG_LEVEL_2M;
+ } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
/*
* If we're with initial-all-set, we don't need
* to write protect any small page because
@@ -10757,11 +10900,9 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
* so that the page split can happen lazily on
* the first write to the huge page.
*/
- kvm_mmu_slot_remove_write_access(kvm, new, level);
+ level = PG_LEVEL_2M;
}
- } else {
- if (kvm_x86_ops.slot_disable_log_dirty)
- kvm_x86_ops.slot_disable_log_dirty(kvm, new);
+ kvm_mmu_slot_remove_write_access(kvm, new, level);
}
}
@@ -10800,7 +10941,7 @@ static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
kvm_x86_ops.guest_apic_has_interrupt &&
- kvm_x86_ops.guest_apic_has_interrupt(vcpu));
+ static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
}
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -10819,12 +10960,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
(vcpu->arch.nmi_pending &&
- kvm_x86_ops.nmi_allowed(vcpu, false)))
+ static_call(kvm_x86_nmi_allowed)(vcpu, false)))
return true;
if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
(vcpu->arch.smi_pending &&
- kvm_x86_ops.smi_allowed(vcpu, false)))
+ static_call(kvm_x86_smi_allowed)(vcpu, false)))
return true;
if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -10858,7 +10999,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
kvm_test_request(KVM_REQ_EVENT, vcpu))
return true;
- if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
+ if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
return true;
return false;
@@ -10876,7 +11017,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops.interrupt_allowed(vcpu, false);
+ return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
}
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -10902,7 +11043,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags;
- rflags = kvm_x86_ops.get_rflags(vcpu);
+ rflags = static_call(kvm_x86_get_rflags)(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
rflags &= ~X86_EFLAGS_TF;
return rflags;
@@ -10914,7 +11055,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
- kvm_x86_ops.set_rflags(vcpu, rflags);
+ static_call(kvm_x86_set_rflags)(vcpu, rflags);
}
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -11044,7 +11185,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
return false;
if (!kvm_pv_async_pf_enabled(vcpu) ||
- (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
+ (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0))
return false;
return true;
@@ -11189,7 +11330,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
- ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
+ ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
if (ret)
@@ -11214,7 +11355,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
- ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
@@ -11225,7 +11366,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
+ return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
}
bool kvm_vector_hashing_enabled(void)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 0f727b50bd3d..39eb04887141 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -98,7 +98,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
if (!is_long_mode(vcpu))
return false;
- kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
return cs_l;
}
@@ -129,7 +129,7 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops.tlb_flush_current(vcpu);
+ static_call(kvm_x86_tlb_flush_current)(vcpu);
}
static inline int is_pae(struct kvm_vcpu *vcpu)
@@ -244,9 +244,10 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
{
- return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu);
+ return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
}
+void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs);
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -273,6 +274,8 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
bool kvm_vector_hashing_enabled(void);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len);
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
@@ -296,6 +299,8 @@ extern int pi_inject_timer;
extern struct static_key kvm_no_apic_vcpu;
+extern bool report_ignored_msrs;
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
@@ -391,7 +396,6 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
int kvm_spec_ctrl_test_value(u64 value);
bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu);
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e);
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
new file mode 100644
index 000000000000..ae17250e1efe
--- /dev/null
+++ b/arch/x86/kvm/xen.c
@@ -0,0 +1,721 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#include "x86.h"
+#include "xen.h"
+#include "hyperv.h"
+
+#include <linux/kvm_host.h>
+#include <linux/sched/stat.h>
+
+#include <trace/events/kvm.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include "trace.h"
+
+DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
+
+static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+{
+ gpa_t gpa = gfn_to_gpa(gfn);
+ int wc_ofs, sec_hi_ofs;
+ int ret;
+ int idx = srcu_read_lock(&kvm->srcu);
+
+ ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache,
+ gpa, PAGE_SIZE);
+ if (ret)
+ goto out;
+
+ kvm->arch.xen.shinfo_set = true;
+
+ /* Paranoia checks on the 32-bit struct layout */
+ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
+ BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ /* 32-bit location by default */
+ wc_ofs = offsetof(struct compat_shared_info, wc);
+ sec_hi_ofs = offsetof(struct compat_shared_info, arch.wc_sec_hi);
+
+#ifdef CONFIG_X86_64
+ /* Paranoia checks on the 64-bit struct layout */
+ BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
+ BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);
+
+ if (kvm->arch.xen.long_mode) {
+ wc_ofs = offsetof(struct shared_info, wc);
+ sec_hi_ofs = offsetof(struct shared_info, wc_sec_hi);
+ }
+#endif
+
+ kvm_write_wall_clock(kvm, gpa + wc_ofs, sec_hi_ofs - wc_ofs);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return ret;
+}
+
+static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+{
+ struct kvm_vcpu_xen *vx = &v->arch.xen;
+ u64 now = get_kvmclock_ns(v->kvm);
+ u64 delta_ns = now - vx->runstate_entry_time;
+ u64 run_delay = current->sched_info.run_delay;
+
+ if (unlikely(!vx->runstate_entry_time))
+ vx->current_runstate = RUNSTATE_offline;
+
+ /*
+ * Time waiting for the scheduler isn't "stolen" if the
+ * vCPU wasn't running anyway.
+ */
+ if (vx->current_runstate == RUNSTATE_running) {
+ u64 steal_ns = run_delay - vx->last_steal;
+
+ delta_ns -= steal_ns;
+
+ vx->runstate_times[RUNSTATE_runnable] += steal_ns;
+ }
+ vx->last_steal = run_delay;
+
+ vx->runstate_times[vx->current_runstate] += delta_ns;
+ vx->current_runstate = state;
+ vx->runstate_entry_time = now;
+}
+
+void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
+{
+ struct kvm_vcpu_xen *vx = &v->arch.xen;
+ uint64_t state_entry_time;
+ unsigned int offset;
+
+ kvm_xen_update_runstate(v, state);
+
+ if (!vx->runstate_set)
+ return;
+
+ BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
+
+ offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time);
+#ifdef CONFIG_X86_64
+ /*
+ * The only difference is alignment of uint64_t in 32-bit.
+ * So the first field 'state' is accessed directly using
+ * offsetof() (where its offset happens to be zero), while the
+ * remaining fields which are all uint64_t, start at 'offset'
+ * which we tweak here by adding 4.
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+ offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
+ offsetof(struct compat_vcpu_runstate_info, time) + 4);
+
+ if (v->kvm->arch.xen.long_mode)
+ offset = offsetof(struct vcpu_runstate_info, state_entry_time);
+#endif
+ /*
+ * First write the updated state_entry_time at the appropriate
+ * location determined by 'offset'.
+ */
+ state_entry_time = vx->runstate_entry_time;
+ state_entry_time |= XEN_RUNSTATE_UPDATE;
+
+ BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state_entry_time) !=
+ sizeof(state_entry_time));
+ BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state_entry_time) !=
+ sizeof(state_entry_time));
+
+ if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+ &state_entry_time, offset,
+ sizeof(state_entry_time)))
+ return;
+ smp_wmb();
+
+ /*
+ * Next, write the new runstate. This is in the *same* place
+ * for 32-bit and 64-bit guests, asserted here for paranoia.
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
+ offsetof(struct compat_vcpu_runstate_info, state));
+ BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->state) !=
+ sizeof(vx->current_runstate));
+ BUILD_BUG_ON(sizeof(((struct compat_vcpu_runstate_info *)0)->state) !=
+ sizeof(vx->current_runstate));
+
+ if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+ &vx->current_runstate,
+ offsetof(struct vcpu_runstate_info, state),
+ sizeof(vx->current_runstate)))
+ return;
+
+ /*
+ * Write the actual runstate times immediately after the
+ * runstate_entry_time.
+ */
+ BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
+ offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
+ offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
+ BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
+ sizeof(((struct compat_vcpu_runstate_info *)0)->time));
+ BUILD_BUG_ON(sizeof(((struct vcpu_runstate_info *)0)->time) !=
+ sizeof(vx->runstate_times));
+
+ if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+ &vx->runstate_times[0],
+ offset + sizeof(u64),
+ sizeof(vx->runstate_times)))
+ return;
+
+ smp_wmb();
+
+ /*
+ * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
+ * runstate_entry_time field.
+ */
+
+ state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+ if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache,
+ &state_entry_time, offset,
+ sizeof(state_entry_time)))
+ return;
+}
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+{
+ u8 rc = 0;
+
+ /*
+ * If the global upcall vector (HVMIRQ_callback_vector) is set and
+ * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
+ */
+ struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
+ struct kvm_memslots *slots = kvm_memslots(v->kvm);
+ unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
+
+ /* No need for compat handling here */
+ BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
+ offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
+ BUILD_BUG_ON(sizeof(rc) !=
+ sizeof(((struct vcpu_info *)0)->evtchn_upcall_pending));
+ BUILD_BUG_ON(sizeof(rc) !=
+ sizeof(((struct compat_vcpu_info *)0)->evtchn_upcall_pending));
+
+ /*
+ * For efficiency, this mirrors the checks for using the valid
+ * cache in kvm_read_guest_offset_cached(), but just uses
+ * __get_user() instead. And falls back to the slow path.
+ */
+ if (likely(slots->generation == ghc->generation &&
+ !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
+ /* Fast path */
+ __get_user(rc, (u8 __user *)ghc->hva + offset);
+ } else {
+ /* Slow path */
+ kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
+ sizeof(rc));
+ }
+
+ return rc;
+}
+
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&kvm->lock);
+
+ switch (data->type) {
+ case KVM_XEN_ATTR_TYPE_LONG_MODE:
+ if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
+ r = -EINVAL;
+ } else {
+ kvm->arch.xen.long_mode = !!data->u.long_mode;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ if (data->u.shared_info.gfn == GPA_INVALID) {
+ kvm->arch.xen.shinfo_set = false;
+ r = 0;
+ break;
+ }
+ r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+ break;
+
+
+ case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+ if (data->u.vector && data->u.vector < 0x10)
+ r = -EINVAL;
+ else {
+ kvm->arch.xen.upcall_vector = data->u.vector;
+ r = 0;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&kvm->lock);
+
+ switch (data->type) {
+ case KVM_XEN_ATTR_TYPE_LONG_MODE:
+ data->u.long_mode = kvm->arch.xen.long_mode;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ if (kvm->arch.xen.shinfo_set)
+ data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
+ else
+ data->u.shared_info.gfn = GPA_INVALID;
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
+ data->u.vector = kvm->arch.xen.upcall_vector;
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+ int idx, r = -ENOENT;
+
+ mutex_lock(&vcpu->kvm->lock);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ switch (data->type) {
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ /* No compat necessary here. */
+ BUILD_BUG_ON(sizeof(struct vcpu_info) !=
+ sizeof(struct compat_vcpu_info));
+ BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
+ offsetof(struct compat_vcpu_info, time));
+
+ if (data->u.gpa == GPA_INVALID) {
+ vcpu->arch.xen.vcpu_info_set = false;
+ r = 0;
+ break;
+ }
+
+ r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa,
+ sizeof(struct vcpu_info));
+ if (!r) {
+ vcpu->arch.xen.vcpu_info_set = true;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (data->u.gpa == GPA_INVALID) {
+ vcpu->arch.xen.vcpu_time_info_set = false;
+ r = 0;
+ break;
+ }
+
+ r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache,
+ data->u.gpa,
+ sizeof(struct pvclock_vcpu_time_info));
+ if (!r) {
+ vcpu->arch.xen.vcpu_time_info_set = true;
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.gpa == GPA_INVALID) {
+ vcpu->arch.xen.runstate_set = false;
+ r = 0;
+ break;
+ }
+
+ r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.xen.runstate_cache,
+ data->u.gpa,
+ sizeof(struct vcpu_runstate_info));
+ if (!r) {
+ vcpu->arch.xen.runstate_set = true;
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.runstate.state > RUNSTATE_offline) {
+ r = -EINVAL;
+ break;
+ }
+
+ kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.runstate.state > RUNSTATE_offline) {
+ r = -EINVAL;
+ break;
+ }
+ if (data->u.runstate.state_entry_time !=
+ (data->u.runstate.time_running +
+ data->u.runstate.time_runnable +
+ data->u.runstate.time_blocked +
+ data->u.runstate.time_offline)) {
+ r = -EINVAL;
+ break;
+ }
+ if (get_kvmclock_ns(vcpu->kvm) <
+ data->u.runstate.state_entry_time) {
+ r = -EINVAL;
+ break;
+ }
+
+ vcpu->arch.xen.current_runstate = data->u.runstate.state;
+ vcpu->arch.xen.runstate_entry_time =
+ data->u.runstate.state_entry_time;
+ vcpu->arch.xen.runstate_times[RUNSTATE_running] =
+ data->u.runstate.time_running;
+ vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
+ data->u.runstate.time_runnable;
+ vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
+ data->u.runstate.time_blocked;
+ vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
+ data->u.runstate.time_offline;
+ vcpu->arch.xen.last_steal = current->sched_info.run_delay;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (data->u.runstate.state > RUNSTATE_offline &&
+ data->u.runstate.state != (u64)-1) {
+ r = -EINVAL;
+ break;
+ }
+ /* The adjustment must add up */
+ if (data->u.runstate.state_entry_time !=
+ (data->u.runstate.time_running +
+ data->u.runstate.time_runnable +
+ data->u.runstate.time_blocked +
+ data->u.runstate.time_offline)) {
+ r = -EINVAL;
+ break;
+ }
+
+ if (get_kvmclock_ns(vcpu->kvm) <
+ (vcpu->arch.xen.runstate_entry_time +
+ data->u.runstate.state_entry_time)) {
+ r = -EINVAL;
+ break;
+ }
+
+ vcpu->arch.xen.runstate_entry_time +=
+ data->u.runstate.state_entry_time;
+ vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
+ data->u.runstate.time_running;
+ vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
+ data->u.runstate.time_runnable;
+ vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
+ data->u.runstate.time_blocked;
+ vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
+ data->u.runstate.time_offline;
+
+ if (data->u.runstate.state <= RUNSTATE_offline)
+ kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+ r = 0;
+ break;
+
+ default:
+ break;
+ }
+
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+}
+
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
+{
+ int r = -ENOENT;
+
+ mutex_lock(&vcpu->kvm->lock);
+
+ switch (data->type) {
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ if (vcpu->arch.xen.vcpu_info_set)
+ data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
+ else
+ data->u.gpa = GPA_INVALID;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
+ if (vcpu->arch.xen.vcpu_time_info_set)
+ data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
+ else
+ data->u.gpa = GPA_INVALID;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ if (vcpu->arch.xen.runstate_set) {
+ data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ data->u.runstate.state = vcpu->arch.xen.current_runstate;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ data->u.runstate.state = vcpu->arch.xen.current_runstate;
+ data->u.runstate.state_entry_time =
+ vcpu->arch.xen.runstate_entry_time;
+ data->u.runstate.time_running =
+ vcpu->arch.xen.runstate_times[RUNSTATE_running];
+ data->u.runstate.time_runnable =
+ vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
+ data->u.runstate.time_blocked =
+ vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
+ data->u.runstate.time_offline =
+ vcpu->arch.xen.runstate_times[RUNSTATE_offline];
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
+ r = -EINVAL;
+ break;
+
+ default:
+ break;
+ }
+
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+}
+
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u32 page_num = data & ~PAGE_MASK;
+ u64 page_addr = data & PAGE_MASK;
+ bool lm = is_long_mode(vcpu);
+
+ /* Latch long_mode for shared_info pages etc. */
+ vcpu->kvm->arch.xen.long_mode = lm;
+
+ /*
+ * If Xen hypercall intercept is enabled, fill the hypercall
+ * page with VMCALL/VMMCALL instructions since that's what
+ * we catch. Else the VMM has provided the hypercall pages
+ * with instructions of its own choosing, so use those.
+ */
+ if (kvm_xen_hypercall_enabled(kvm)) {
+ u8 instructions[32];
+ int i;
+
+ if (page_num)
+ return 1;
+
+ /* mov imm32, %eax */
+ instructions[0] = 0xb8;
+
+ /* vmcall / vmmcall */
+ kvm_x86_ops.patch_hypercall(vcpu, instructions + 5);
+
+ /* ret */
+ instructions[8] = 0xc3;
+
+ /* int3 to pad */
+ memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
+
+ for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
+ *(u32 *)&instructions[1] = i;
+ if (kvm_vcpu_write_guest(vcpu,
+ page_addr + (i * sizeof(instructions)),
+ instructions, sizeof(instructions)))
+ return 1;
+ }
+ } else {
+ /*
+ * Note, truncation is a non-issue as 'lm' is guaranteed to be
+ * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
+ */
+ hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
+ : kvm->arch.xen_hvm_config.blob_addr_32;
+ u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
+ : kvm->arch.xen_hvm_config.blob_size_32;
+ u8 *page;
+
+ if (page_num >= blob_size)
+ return 1;
+
+ blob_addr += page_num * PAGE_SIZE;
+
+ page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
+ kfree(page);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
+{
+ if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
+ return -EINVAL;
+
+ /*
+ * With hypercall interception the kernel generates its own
+ * hypercall page so it must not be provided.
+ */
+ if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
+ (xhc->blob_addr_32 || xhc->blob_addr_64 ||
+ xhc->blob_size_32 || xhc->blob_size_64))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
+ static_branch_inc(&kvm_xen_enabled.key);
+ else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+
+ memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
+
+ mutex_unlock(&kvm->lock);
+ return 0;
+}
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+ if (kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+}
+
+static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+ kvm_rax_write(vcpu, result);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+ struct kvm_run *run = vcpu->run;
+
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
+ return 1;
+
+ return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
+}
+
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
+{
+ bool longmode;
+ u64 input, params[6];
+
+ input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
+
+ /* Hyper-V hypercalls get bit 31 set in EAX */
+ if ((input & 0x80000000) &&
+ kvm_hv_hypercall_enabled(vcpu))
+ return kvm_hv_hypercall(vcpu);
+
+ longmode = is_64_bit_mode(vcpu);
+ if (!longmode) {
+ params[0] = (u32)kvm_rbx_read(vcpu);
+ params[1] = (u32)kvm_rcx_read(vcpu);
+ params[2] = (u32)kvm_rdx_read(vcpu);
+ params[3] = (u32)kvm_rsi_read(vcpu);
+ params[4] = (u32)kvm_rdi_read(vcpu);
+ params[5] = (u32)kvm_rbp_read(vcpu);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ params[0] = (u64)kvm_rdi_read(vcpu);
+ params[1] = (u64)kvm_rsi_read(vcpu);
+ params[2] = (u64)kvm_rdx_read(vcpu);
+ params[3] = (u64)kvm_r10_read(vcpu);
+ params[4] = (u64)kvm_r8_read(vcpu);
+ params[5] = (u64)kvm_r9_read(vcpu);
+ }
+#endif
+ trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
+ params[3], params[4], params[5]);
+
+ vcpu->run->exit_reason = KVM_EXIT_XEN;
+ vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
+ vcpu->run->xen.u.hcall.longmode = longmode;
+ vcpu->run->xen.u.hcall.cpl = kvm_x86_ops.get_cpl(vcpu);
+ vcpu->run->xen.u.hcall.input = input;
+ vcpu->run->xen.u.hcall.params[0] = params[0];
+ vcpu->run->xen.u.hcall.params[1] = params[1];
+ vcpu->run->xen.u.hcall.params[2] = params[2];
+ vcpu->run->xen.u.hcall.params[3] = params[3];
+ vcpu->run->xen.u.hcall.params[4] = params[4];
+ vcpu->run->xen.u.hcall.params[5] = params[5];
+ vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io =
+ kvm_xen_hypercall_complete_userspace;
+
+ return 0;
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
new file mode 100644
index 000000000000..463a7844a8ca
--- /dev/null
+++ b/arch/x86/kvm/xen.h
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
+ * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * KVM Xen emulation
+ */
+
+#ifndef __ARCH_X86_KVM_XEN_H__
+#define __ARCH_X86_KVM_XEN_H__
+
+#ifdef CONFIG_KVM_XEN
+#include <linux/jump_label_ratelimit.h>
+
+extern struct static_key_false_deferred kvm_xen_enabled;
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
+int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
+int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
+void kvm_xen_destroy_vm(struct kvm *kvm);
+
+static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ kvm->arch.xen_hvm_config.msr;
+}
+
+static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ (kvm->arch.xen_hvm_config.flags &
+ KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
+}
+
+static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector)
+ return __kvm_xen_has_interrupt(vcpu);
+
+ return 0;
+}
+#else
+static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
+{
+ return 1;
+}
+
+static inline void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+}
+
+static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
+{
+ return false;
+}
+
+static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
+{
+ return false;
+}
+
+static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+#endif
+
+int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
+
+#include <asm/pvclock-abi.h>
+#include <asm/xen/interface.h>
+#include <xen/interface/vcpu.h>
+
+void kvm_xen_update_runstate_guest(struct kvm_vcpu *vcpu, int state);
+
+static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu)
+{
+ kvm_xen_update_runstate_guest(vcpu, RUNSTATE_running);
+}
+
+static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
+{
+ /*
+ * If the vCPU wasn't preempted but took a normal exit for
+ * some reason (hypercalls, I/O, etc.), that is accounted as
+ * still RUNSTATE_running, as the VMM is still operating on
+ * behalf of the vCPU. Only if the VMM does actually block
+ * does it need to enter RUNSTATE_blocked.
+ */
+ if (vcpu->preempted)
+ kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+}
+
+/* 32-bit compatibility definitions, also used natively in 32-bit build */
+struct compat_arch_vcpu_info {
+ unsigned int cr2;
+ unsigned int pad[5];
+};
+
+struct compat_vcpu_info {
+ uint8_t evtchn_upcall_pending;
+ uint8_t evtchn_upcall_mask;
+ uint16_t pad;
+ uint32_t evtchn_pending_sel;
+ struct compat_arch_vcpu_info arch;
+ struct pvclock_vcpu_time_info time;
+}; /* 64 bytes (x86) */
+
+struct compat_arch_shared_info {
+ unsigned int max_pfn;
+ unsigned int pfn_to_mfn_frame_list_list;
+ unsigned int nmi_reason;
+ unsigned int p2m_cr3;
+ unsigned int p2m_vaddr;
+ unsigned int p2m_generation;
+ uint32_t wc_sec_hi;
+};
+
+struct compat_shared_info {
+ struct compat_vcpu_info vcpu_info[MAX_VIRT_CPUS];
+ uint32_t evtchn_pending[32];
+ uint32_t evtchn_mask[32];
+ struct pvclock_wall_clock wc;
+ struct compat_arch_shared_info arch;
+};
+
+struct compat_vcpu_runstate_info {
+ int state;
+ uint64_t state_entry_time;
+ uint64_t time[4];
+} __attribute__((packed));
+
+#endif /* __ARCH_X86_KVM_XEN_H__ */
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 4229950a5d78..bb0b3fe1e0a0 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -1415,6 +1415,25 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
}
}
+static unsigned long insn_get_effective_ip(struct pt_regs *regs)
+{
+ unsigned long seg_base = 0;
+
+ /*
+ * If not in user-space long mode, a custom code segment could be in
+ * use. This is true in protected mode (if the process defined a local
+ * descriptor table), or virtual-8086 mode. In most of the cases
+ * seg_base will be zero as in USER_CS.
+ */
+ if (!user_64bit_mode(regs)) {
+ seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
+ if (seg_base == -1L)
+ return 0;
+ }
+
+ return seg_base + regs->ip;
+}
+
/**
* insn_fetch_from_user() - Copy instruction bytes from user-space memory
* @regs: Structure with register values as seen when entering kernel mode
@@ -1431,24 +1450,43 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
*/
int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
{
- unsigned long seg_base = 0;
+ unsigned long ip;
int not_copied;
- /*
- * If not in user-space long mode, a custom code segment could be in
- * use. This is true in protected mode (if the process defined a local
- * descriptor table), or virtual-8086 mode. In most of the cases
- * seg_base will be zero as in USER_CS.
- */
- if (!user_64bit_mode(regs)) {
- seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
- if (seg_base == -1L)
- return 0;
- }
+ ip = insn_get_effective_ip(regs);
+ if (!ip)
+ return 0;
+
+ not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE);
+ return MAX_INSN_SIZE - not_copied;
+}
+
+/**
+ * insn_fetch_from_user_inatomic() - Copy instruction bytes from user-space memory
+ * while in atomic code
+ * @regs: Structure with register values as seen when entering kernel mode
+ * @buf: Array to store the fetched instruction
+ *
+ * Gets the linear address of the instruction and copies the instruction bytes
+ * to the buf. This function must be used in atomic context.
+ *
+ * Returns:
+ *
+ * Number of instruction bytes copied.
+ *
+ * 0 if nothing was copied.
+ */
+int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
+{
+ unsigned long ip;
+ int not_copied;
+
+ ip = insn_get_effective_ip(regs);
+ if (!ip)
+ return 0;
- not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
- MAX_INSN_SIZE);
+ not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE);
return MAX_INSN_SIZE - not_copied;
}
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 404279563891..435630a6ec97 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -5,6 +5,7 @@
* Copyright (C) IBM Corporation, 2002, 2004, 2009
*/
+#include <linux/kernel.h>
#ifdef __KERNEL__
#include <linux/string.h>
#else
@@ -15,15 +16,28 @@
#include <asm/emulate_prefix.h>
+#define leXX_to_cpu(t, r) \
+({ \
+ __typeof__(t) v; \
+ switch (sizeof(t)) { \
+ case 4: v = le32_to_cpu(r); break; \
+ case 2: v = le16_to_cpu(r); break; \
+ case 1: v = r; break; \
+ default: \
+ BUILD_BUG(); break; \
+ } \
+ v; \
+})
+
/* Verify next sizeof(t) bytes can be on the same instruction */
#define validate_next(t, insn, n) \
((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr)
#define __get_next(t, insn) \
- ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
+ ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); })
#define __peek_nbyte_next(t, insn, n) \
- ({ t r = *(t*)((insn)->next_byte + n); r; })
+ ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); })
#define get_next(t, insn) \
({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); })
@@ -147,9 +161,9 @@ found:
b = insn->prefixes.bytes[3];
for (i = 0; i < nb; i++)
if (prefixes->bytes[i] == lb)
- prefixes->bytes[i] = b;
+ insn_set_byte(prefixes, i, b);
}
- insn->prefixes.bytes[3] = lb;
+ insn_set_byte(&insn->prefixes, 3, lb);
}
/* Decode REX prefix */
@@ -157,8 +171,7 @@ found:
b = peek_next(insn_byte_t, insn);
attr = inat_get_opcode_attribute(b);
if (inat_is_rex_prefix(attr)) {
- insn->rex_prefix.value = b;
- insn->rex_prefix.nbytes = 1;
+ insn_field_set(&insn->rex_prefix, b, 1);
insn->next_byte++;
if (X86_REX_W(b))
/* REX.W overrides opnd_size */
@@ -181,13 +194,13 @@ found:
if (X86_MODRM_MOD(b2) != 3)
goto vex_end;
}
- insn->vex_prefix.bytes[0] = b;
- insn->vex_prefix.bytes[1] = b2;
+ insn_set_byte(&insn->vex_prefix, 0, b);
+ insn_set_byte(&insn->vex_prefix, 1, b2);
if (inat_is_evex_prefix(attr)) {
b2 = peek_nbyte_next(insn_byte_t, insn, 2);
- insn->vex_prefix.bytes[2] = b2;
+ insn_set_byte(&insn->vex_prefix, 2, b2);
b2 = peek_nbyte_next(insn_byte_t, insn, 3);
- insn->vex_prefix.bytes[3] = b2;
+ insn_set_byte(&insn->vex_prefix, 3, b2);
insn->vex_prefix.nbytes = 4;
insn->next_byte += 4;
if (insn->x86_64 && X86_VEX_W(b2))
@@ -195,7 +208,7 @@ found:
insn->opnd_bytes = 8;
} else if (inat_is_vex3_prefix(attr)) {
b2 = peek_nbyte_next(insn_byte_t, insn, 2);
- insn->vex_prefix.bytes[2] = b2;
+ insn_set_byte(&insn->vex_prefix, 2, b2);
insn->vex_prefix.nbytes = 3;
insn->next_byte += 3;
if (insn->x86_64 && X86_VEX_W(b2))
@@ -207,7 +220,7 @@ found:
* Makes it easier to decode vex.W, vex.vvvv,
* vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
*/
- insn->vex_prefix.bytes[2] = b2 & 0x7f;
+ insn_set_byte(&insn->vex_prefix, 2, b2 & 0x7f);
insn->vex_prefix.nbytes = 2;
insn->next_byte += 2;
}
@@ -243,7 +256,7 @@ void insn_get_opcode(struct insn *insn)
/* Get first opcode */
op = get_next(insn_byte_t, insn);
- opcode->bytes[0] = op;
+ insn_set_byte(opcode, 0, op);
opcode->nbytes = 1;
/* Check if there is VEX prefix or not */
@@ -295,8 +308,7 @@ void insn_get_modrm(struct insn *insn)
if (inat_has_modrm(insn->attr)) {
mod = get_next(insn_byte_t, insn);
- modrm->value = mod;
- modrm->nbytes = 1;
+ insn_field_set(modrm, mod, 1);
if (inat_is_group(insn->attr)) {
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_group_attribute(mod, pfx_id,
@@ -334,7 +346,7 @@ int insn_rip_relative(struct insn *insn)
* For rip-relative instructions, the mod field (top 2 bits)
* is zero and the r/m field (bottom 3 bits) is 0x5.
*/
- return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
+ return (modrm->nbytes && (modrm->bytes[0] & 0xc7) == 0x5);
}
/**
@@ -353,11 +365,11 @@ void insn_get_sib(struct insn *insn)
if (!insn->modrm.got)
insn_get_modrm(insn);
if (insn->modrm.nbytes) {
- modrm = (insn_byte_t)insn->modrm.value;
+ modrm = insn->modrm.bytes[0];
if (insn->addr_bytes != 2 &&
X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
- insn->sib.value = get_next(insn_byte_t, insn);
- insn->sib.nbytes = 1;
+ insn_field_set(&insn->sib,
+ get_next(insn_byte_t, insn), 1);
}
}
insn->sib.got = 1;
@@ -407,19 +419,18 @@ void insn_get_displacement(struct insn *insn)
if (mod == 3)
goto out;
if (mod == 1) {
- insn->displacement.value = get_next(signed char, insn);
- insn->displacement.nbytes = 1;
+ insn_field_set(&insn->displacement,
+ get_next(signed char, insn), 1);
} else if (insn->addr_bytes == 2) {
if ((mod == 0 && rm == 6) || mod == 2) {
- insn->displacement.value =
- get_next(short, insn);
- insn->displacement.nbytes = 2;
+ insn_field_set(&insn->displacement,
+ get_next(short, insn), 2);
}
} else {
if ((mod == 0 && rm == 5) || mod == 2 ||
(mod == 0 && base == 5)) {
- insn->displacement.value = get_next(int, insn);
- insn->displacement.nbytes = 4;
+ insn_field_set(&insn->displacement,
+ get_next(int, insn), 4);
}
}
}
@@ -435,18 +446,14 @@ static int __get_moffset(struct insn *insn)
{
switch (insn->addr_bytes) {
case 2:
- insn->moffset1.value = get_next(short, insn);
- insn->moffset1.nbytes = 2;
+ insn_field_set(&insn->moffset1, get_next(short, insn), 2);
break;
case 4:
- insn->moffset1.value = get_next(int, insn);
- insn->moffset1.nbytes = 4;
+ insn_field_set(&insn->moffset1, get_next(int, insn), 4);
break;
case 8:
- insn->moffset1.value = get_next(int, insn);
- insn->moffset1.nbytes = 4;
- insn->moffset2.value = get_next(int, insn);
- insn->moffset2.nbytes = 4;
+ insn_field_set(&insn->moffset1, get_next(int, insn), 4);
+ insn_field_set(&insn->moffset2, get_next(int, insn), 4);
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
@@ -464,13 +471,11 @@ static int __get_immv32(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
- insn->immediate.value = get_next(short, insn);
- insn->immediate.nbytes = 2;
+ insn_field_set(&insn->immediate, get_next(short, insn), 2);
break;
case 4:
case 8:
- insn->immediate.value = get_next(int, insn);
- insn->immediate.nbytes = 4;
+ insn_field_set(&insn->immediate, get_next(int, insn), 4);
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
@@ -487,18 +492,15 @@ static int __get_immv(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
- insn->immediate1.value = get_next(short, insn);
- insn->immediate1.nbytes = 2;
+ insn_field_set(&insn->immediate1, get_next(short, insn), 2);
break;
case 4:
- insn->immediate1.value = get_next(int, insn);
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
insn->immediate1.nbytes = 4;
break;
case 8:
- insn->immediate1.value = get_next(int, insn);
- insn->immediate1.nbytes = 4;
- insn->immediate2.value = get_next(int, insn);
- insn->immediate2.nbytes = 4;
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
+ insn_field_set(&insn->immediate2, get_next(int, insn), 4);
break;
default: /* opnd_bytes must be modified manually */
goto err_out;
@@ -515,12 +517,10 @@ static int __get_immptr(struct insn *insn)
{
switch (insn->opnd_bytes) {
case 2:
- insn->immediate1.value = get_next(short, insn);
- insn->immediate1.nbytes = 2;
+ insn_field_set(&insn->immediate1, get_next(short, insn), 2);
break;
case 4:
- insn->immediate1.value = get_next(int, insn);
- insn->immediate1.nbytes = 4;
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
break;
case 8:
/* ptr16:64 is not exist (no segment) */
@@ -528,8 +528,7 @@ static int __get_immptr(struct insn *insn)
default: /* opnd_bytes must be modified manually */
goto err_out;
}
- insn->immediate2.value = get_next(unsigned short, insn);
- insn->immediate2.nbytes = 2;
+ insn_field_set(&insn->immediate2, get_next(unsigned short, insn), 2);
insn->immediate1.got = insn->immediate2.got = 1;
return 1;
@@ -565,22 +564,17 @@ void insn_get_immediate(struct insn *insn)
switch (inat_immediate_size(insn->attr)) {
case INAT_IMM_BYTE:
- insn->immediate.value = get_next(signed char, insn);
- insn->immediate.nbytes = 1;
+ insn_field_set(&insn->immediate, get_next(signed char, insn), 1);
break;
case INAT_IMM_WORD:
- insn->immediate.value = get_next(short, insn);
- insn->immediate.nbytes = 2;
+ insn_field_set(&insn->immediate, get_next(short, insn), 2);
break;
case INAT_IMM_DWORD:
- insn->immediate.value = get_next(int, insn);
- insn->immediate.nbytes = 4;
+ insn_field_set(&insn->immediate, get_next(int, insn), 4);
break;
case INAT_IMM_QWORD:
- insn->immediate1.value = get_next(int, insn);
- insn->immediate1.nbytes = 4;
- insn->immediate2.value = get_next(int, insn);
- insn->immediate2.nbytes = 4;
+ insn_field_set(&insn->immediate1, get_next(int, insn), 4);
+ insn_field_set(&insn->immediate2, get_next(int, insn), 4);
break;
case INAT_IMM_PTR:
if (!__get_immptr(insn))
@@ -599,8 +593,7 @@ void insn_get_immediate(struct insn *insn)
goto err_out;
}
if (inat_has_second_immediate(insn->attr)) {
- insn->immediate2.value = get_next(signed char, insn);
- insn->immediate2.nbytes = 1;
+ insn_field_set(&insn->immediate2, get_next(signed char, insn), 1);
}
done:
insn->immediate.got = 1;
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index b4c43a9b1483..f6fb1d218dcc 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -28,7 +28,7 @@ SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg)
jmp .Lspec_trap_\@
.Ldo_rop_\@:
mov %\reg, (%_ASM_SP)
- UNWIND_HINT_RET_OFFSET
+ UNWIND_HINT_FUNC
ret
SYM_FUNC_END(__x86_retpoline_\reg)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f1f1b5a0956a..a73347e2cdfc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -9,6 +9,7 @@
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/extable.h> /* search_exception_tables */
#include <linux/memblock.h> /* max_low_pfn */
+#include <linux/kfence.h> /* kfence_handle_page_fault */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
@@ -16,7 +17,7 @@
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
-#include <linux/efi.h> /* efi_recover_from_page_fault()*/
+#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
@@ -25,7 +26,7 @@
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
-#include <asm/efi.h> /* efi_recover_from_page_fault()*/
+#include <asm/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
@@ -54,7 +55,7 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)
* 32-bit mode:
*
* Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- * Check that here and ignore it.
+ * Check that here and ignore it. This is AMD erratum #91.
*
* 64-bit mode:
*
@@ -83,11 +84,7 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
#ifdef CONFIG_X86_64
case 0x40:
/*
- * In AMD64 long mode 0x40..0x4F are valid REX prefixes
- * Need to figure out under what instruction mode the
- * instruction was issued. Could check the LDT for lm,
- * but for now it's good enough to assume that long
- * mode only uses well known segments or kernel.
+ * In 64-bit mode 0x40..0x4F are valid REX prefixes
*/
return (!user_mode(regs) || user_64bit_mode(regs));
#endif
@@ -110,6 +107,15 @@ check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
}
}
+static bool is_amd_k8_pre_npt(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ return unlikely(IS_ENABLED(CONFIG_CPU_SUP_AMD) &&
+ c->x86_vendor == X86_VENDOR_AMD &&
+ c->x86 == 0xf && c->x86_model < 0x40);
+}
+
static int
is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
{
@@ -117,6 +123,10 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
unsigned char *instr;
int prefetch = 0;
+ /* Erratum #91 affects AMD K8, pre-NPT CPUs */
+ if (!is_amd_k8_pre_npt())
+ return 0;
+
/*
* If it was a exec (instruction fetch) fault on NX page, then
* do not ignore the fault:
@@ -127,20 +137,31 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
- return 0;
+ /*
+ * This code has historically always bailed out if IP points to a
+ * not-present page (e.g. due to a race). No one has ever
+ * complained about this.
+ */
+ pagefault_disable();
while (instr < max_instr) {
unsigned char opcode;
- if (get_kernel_nofault(opcode, instr))
- break;
+ if (user_mode(regs)) {
+ if (get_user(opcode, instr))
+ break;
+ } else {
+ if (get_kernel_nofault(opcode, instr))
+ break;
+ }
instr++;
if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
break;
}
+
+ pagefault_enable();
return prefetch;
}
@@ -262,25 +283,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
}
}
-/*
- * Did it hit the DOS screen memory VA from vm86 mode?
- */
-static inline void
-check_v8086_mode(struct pt_regs *regs, unsigned long address,
- struct task_struct *tsk)
-{
-#ifdef CONFIG_VM86
- unsigned long bit;
-
- if (!v8086_mode(regs) || !tsk->thread.vm86)
- return;
-
- bit = (address - 0xA0000) >> PAGE_SHIFT;
- if (bit < 32)
- tsk->thread.vm86->screen_bitmap |= 1 << bit;
-#endif
-}
-
static bool low_pfn(unsigned long pfn)
{
return pfn < max_low_pfn;
@@ -335,15 +337,6 @@ KERN_ERR
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif
-/*
- * No vm86 mode in 64-bit mode:
- */
-static inline void
-check_v8086_mode(struct pt_regs *regs, unsigned long address,
- struct task_struct *tsk)
-{
-}
-
static int bad_address(void *p)
{
unsigned long dummy;
@@ -427,6 +420,9 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
|| boot_cpu_data.x86 != 0xf)
return 0;
+ if (user_mode(regs))
+ return 0;
+
if (address != regs->ip)
return 0;
@@ -462,10 +458,12 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
}
/* Pentium F0 0F C7 C8 bug workaround: */
-static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+static int is_f00f_bug(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
- if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
+ if (boot_cpu_has_bug(X86_BUG_F00F) && !(error_code & X86_PF_USER) &&
+ idt_is_f00f_address(address)) {
handle_invalid_op(regs);
return 1;
}
@@ -630,53 +628,20 @@ static void set_signal_archinfo(unsigned long address,
}
static noinline void
-no_context(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int signal, int si_code)
+page_fault_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
- struct task_struct *tsk = current;
unsigned long flags;
int sig;
if (user_mode(regs)) {
/*
- * This is an implicit supervisor-mode access from user
- * mode. Bypass all the kernel-mode recovery code and just
- * OOPS.
+ * Implicit kernel access from user mode? Skip the stack
+ * overflow and EFI special cases.
*/
goto oops;
}
- /* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
- /*
- * Any interrupt that takes a fault gets the fixup. This makes
- * the below recursive fault logic only apply to a faults from
- * task context.
- */
- if (in_interrupt())
- return;
-
- /*
- * Per the above we're !in_interrupt(), aka. task context.
- *
- * In this case we need to make sure we're not recursively
- * faulting through the emulate_vsyscall() logic.
- */
- if (current->thread.sig_on_uaccess_err && signal) {
- sanitize_error_code(address, &error_code);
-
- set_signal_archinfo(address, error_code);
-
- /* XXX: hwpoison faults will set the wrong code. */
- force_sig_fault(signal, si_code, (void __user *)address);
- }
-
- /*
- * Barring that, we can do the fixup and be happy.
- */
- return;
- }
-
#ifdef CONFIG_VMAP_STACK
/*
* Stack overflow? During boot, we can fault near the initial
@@ -684,8 +649,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
* that we're in vmalloc space to avoid this.
*/
if (is_vmalloc_addr((void *)address) &&
- (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
- address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+ (((unsigned long)current->stack - 1 - address < PAGE_SIZE) ||
+ address - ((unsigned long)current->stack + THREAD_SIZE) < PAGE_SIZE)) {
unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
/*
* We're likely to be running with very little stack space
@@ -709,29 +674,18 @@ no_context(struct pt_regs *regs, unsigned long error_code,
#endif
/*
- * 32-bit:
- *
- * Valid to do another page fault here, because if this fault
- * had been triggered by is_prefetch fixup_exception would have
- * handled it.
- *
- * 64-bit:
- *
- * Hall of shame of CPU/BIOS bugs.
+ * Buggy firmware could access regions which might page fault. If
+ * this happens, EFI has a special OOPS path that will try to
+ * avoid hanging the system.
*/
- if (is_prefetch(regs, error_code, address))
- return;
+ if (IS_ENABLED(CONFIG_EFI))
+ efi_crash_gracefully_on_page_fault(address);
- if (is_errata93(regs, address))
+ /* Only not-present faults should be handled by KFENCE. */
+ if (!(error_code & X86_PF_PROT) &&
+ kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
return;
- /*
- * Buggy firmware could access regions which might page fault, try to
- * recover from such faults.
- */
- if (IS_ENABLED(CONFIG_EFI))
- efi_recover_from_page_fault(address);
-
oops:
/*
* Oops. The kernel tried to access some bad page. We'll have to
@@ -741,7 +695,7 @@ oops:
show_fault_oops(regs, error_code, address);
- if (task_stack_end_corrupted(tsk))
+ if (task_stack_end_corrupted(current))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
sig = SIGKILL;
@@ -754,6 +708,53 @@ oops:
oops_end(flags, regs, sig);
}
+static noinline void
+kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address, int signal, int si_code)
+{
+ WARN_ON_ONCE(user_mode(regs));
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
+ /*
+ * Any interrupt that takes a fault gets the fixup. This makes
+ * the below recursive fault logic only apply to a faults from
+ * task context.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
+ * Per the above we're !in_interrupt(), aka. task context.
+ *
+ * In this case we need to make sure we're not recursively
+ * faulting through the emulate_vsyscall() logic.
+ */
+ if (current->thread.sig_on_uaccess_err && signal) {
+ sanitize_error_code(address, &error_code);
+
+ set_signal_archinfo(address, error_code);
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_fault(signal, si_code, (void __user *)address);
+ }
+
+ /*
+ * Barring that, we can do the fixup and be happy.
+ */
+ return;
+ }
+
+ /*
+ * AMD erratum #91 manifests as a spurious page fault on a PREFETCH
+ * instruction.
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
+
+ page_fault_oops(regs, error_code, address);
+}
+
/*
* Print out info about fatal segfaults, if the show_unhandled_signals
* sysctl is set:
@@ -796,47 +797,49 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
{
struct task_struct *tsk = current;
- /* User mode accesses just cause a SIGSEGV */
- if (user_mode(regs) && (error_code & X86_PF_USER)) {
- /*
- * It's possible to have interrupts off here:
- */
- local_irq_enable();
-
- /*
- * Valid to do another page fault here because this one came
- * from user space:
- */
- if (is_prefetch(regs, error_code, address))
- return;
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code);
+ return;
+ }
- if (is_errata100(regs, address))
- return;
+ if (!(error_code & X86_PF_USER)) {
+ /* Implicit user access to kernel memory -- just oops */
+ page_fault_oops(regs, error_code, address);
+ return;
+ }
- sanitize_error_code(address, &error_code);
+ /*
+ * User mode accesses just cause a SIGSEGV.
+ * It's possible to have interrupts off here:
+ */
+ local_irq_enable();
- if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
- return;
+ /*
+ * Valid to do another page fault here because this one came
+ * from user space:
+ */
+ if (is_prefetch(regs, error_code, address))
+ return;
- if (likely(show_unhandled_signals))
- show_signal_msg(regs, error_code, address, tsk);
+ if (is_errata100(regs, address))
+ return;
- set_signal_archinfo(address, error_code);
+ sanitize_error_code(address, &error_code);
- if (si_code == SEGV_PKUERR)
- force_sig_pkuerr((void __user *)address, pkey);
+ if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
+ return;
- force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+ if (likely(show_unhandled_signals))
+ show_signal_msg(regs, error_code, address, tsk);
- local_irq_disable();
+ set_signal_archinfo(address, error_code);
- return;
- }
+ if (si_code == SEGV_PKUERR)
+ force_sig_pkuerr((void __user *)address, pkey);
- if (is_f00f_bug(regs, address))
- return;
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address);
- no_context(regs, error_code, address, SIGSEGV, si_code);
+ local_irq_disable();
}
static noinline void
@@ -926,8 +929,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
vm_fault_t fault)
{
/* Kernel mode? Handle exceptions or die: */
- if (!(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -961,40 +964,6 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
}
-static noinline void
-mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, vm_fault_t fault)
-{
- if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address, 0, 0);
- return;
- }
-
- if (fault & VM_FAULT_OOM) {
- /* Kernel mode? Handle exceptions or die: */
- if (!(error_code & X86_PF_USER)) {
- no_context(regs, error_code, address,
- SIGSEGV, SEGV_MAPERR);
- return;
- }
-
- /*
- * We ran out of memory, call the OOM killer, and return the
- * userspace (which will retry the fault, or kill us if we got
- * oom-killed):
- */
- pagefault_out_of_memory();
- } else {
- if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
- VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, fault);
- else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address);
- else
- BUG();
- }
-}
-
static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
@@ -1209,6 +1178,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
}
#endif
+ if (is_f00f_bug(regs, hw_error_code, address))
+ return;
+
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
@@ -1229,10 +1201,17 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
}
NOKPROBE_SYMBOL(do_kern_addr_fault);
-/* Handle faults in the user portion of the address space */
+/*
+ * Handle faults in the user portion of the address space. Nothing in here
+ * should check X86_PF_USER without a specific justification: for almost
+ * all purposes, we should treat a normal kernel access to user memory
+ * (e.g. get_user(), put_user(), etc.) the same as the WRUSS instruction.
+ * The one exception is AC flag handling, which is, per the x86
+ * architecture, special for WRUSS.
+ */
static inline
void do_user_addr_fault(struct pt_regs *regs,
- unsigned long hw_error_code,
+ unsigned long error_code,
unsigned long address)
{
struct vm_area_struct *vma;
@@ -1244,6 +1223,21 @@ void do_user_addr_fault(struct pt_regs *regs,
tsk = current;
mm = tsk->mm;
+ if (unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
+ /*
+ * Whoops, this is kernel mode code trying to execute from
+ * user memory. Unless this is AMD erratum #93, which
+ * corrupts RIP such that it looks like a user address,
+ * this is unrecoverable. Don't even try to look up the
+ * VMA or look for extable entries.
+ */
+ if (is_errata93(regs, address))
+ return;
+
+ page_fault_oops(regs, error_code, address);
+ return;
+ }
+
/* kprobes don't want to hook the spurious faults: */
if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
@@ -1252,8 +1246,8 @@ void do_user_addr_fault(struct pt_regs *regs,
* Reserved bits are never expected to be set on
* entries in the user portion of the page tables.
*/
- if (unlikely(hw_error_code & X86_PF_RSVD))
- pgtable_bad(regs, hw_error_code, address);
+ if (unlikely(error_code & X86_PF_RSVD))
+ pgtable_bad(regs, error_code, address);
/*
* If SMAP is on, check for invalid kernel (supervisor) access to user
@@ -1263,10 +1257,13 @@ void do_user_addr_fault(struct pt_regs *regs,
* enforcement appears to be consistent with the USER bit.
*/
if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
- !(hw_error_code & X86_PF_USER) &&
- !(regs->flags & X86_EFLAGS_AC)))
- {
- bad_area_nosemaphore(regs, hw_error_code, address);
+ !(error_code & X86_PF_USER) &&
+ !(regs->flags & X86_EFLAGS_AC))) {
+ /*
+ * No extable entry here. This was a kernel access to an
+ * invalid pointer. get_kernel_nofault() will not get here.
+ */
+ page_fault_oops(regs, error_code, address);
return;
}
@@ -1275,7 +1272,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
- bad_area_nosemaphore(regs, hw_error_code, address);
+ bad_area_nosemaphore(regs, error_code, address);
return;
}
@@ -1296,9 +1293,9 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (hw_error_code & X86_PF_WRITE)
+ if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (hw_error_code & X86_PF_INSTR)
+ if (error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
@@ -1314,7 +1311,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* to consider the PF_PK bit.
*/
if (is_vsyscall_vaddr(address)) {
- if (emulate_vsyscall(hw_error_code, regs, address))
+ if (emulate_vsyscall(error_code, regs, address))
return;
}
#endif
@@ -1337,7 +1334,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* Fault from code in kernel from
* which we do not expect faults.
*/
- bad_area_nosemaphore(regs, hw_error_code, address);
+ bad_area_nosemaphore(regs, error_code, address);
return;
}
retry:
@@ -1353,17 +1350,17 @@ retry:
vma = find_vma(mm, address);
if (unlikely(!vma)) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
if (unlikely(expand_stack(vma, address))) {
- bad_area(regs, hw_error_code, address);
+ bad_area(regs, error_code, address);
return;
}
@@ -1372,8 +1369,8 @@ retry:
* we can handle it..
*/
good_area:
- if (unlikely(access_error(hw_error_code, vma))) {
- bad_area_access_error(regs, hw_error_code, address, vma);
+ if (unlikely(access_error(error_code, vma))) {
+ bad_area_access_error(regs, error_code, address, vma);
return;
}
@@ -1392,11 +1389,14 @@ good_area:
*/
fault = handle_mm_fault(vma, address, flags, regs);
- /* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
+ /*
+ * Quick path to respond to signals. The core mm code
+ * has unlocked the mm for us if we get here.
+ */
if (!user_mode(regs))
- no_context(regs, hw_error_code, address, SIGBUS,
- BUS_ADRERR);
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGBUS, BUS_ADRERR);
return;
}
@@ -1412,12 +1412,37 @@ good_area:
}
mmap_read_unlock(mm);
- if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, hw_error_code, address, fault);
+ if (likely(!(fault & VM_FAULT_ERROR)))
+ return;
+
+ if (fatal_signal_pending(current) && !user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address, 0, 0);
return;
}
- check_v8086_mode(regs, address, tsk);
+ if (fault & VM_FAULT_OOM) {
+ /* Kernel mode? Handle exceptions or die: */
+ if (!user_mode(regs)) {
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGSEGV, SEGV_MAPERR);
+ return;
+ }
+
+ /*
+ * We ran out of memory, call the OOM killer, and return the
+ * userspace (which will retry the fault, or kill us if we got
+ * oom-killed):
+ */
+ pagefault_out_of_memory();
+ } else {
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
+ do_sigbus(regs, error_code, address, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+ bad_area_nosemaphore(regs, error_code, address);
+ else
+ BUG();
+ }
}
NOKPROBE_SYMBOL(do_user_addr_fault);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index e26f5c5c6565..dd694fb93916 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -157,16 +157,25 @@ __ref void *alloc_low_pages(unsigned int num)
}
/*
- * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
- * With KASLR memory randomization, depending on the machine e820 memory
- * and the PUD alignment. We may need twice more pages when KASLR memory
+ * By default need to be able to allocate page tables below PGD firstly for
+ * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping.
+ * With KASLR memory randomization, depending on the machine e820 memory and the
+ * PUD alignment, twice that many pages may be needed when KASLR memory
* randomization is enabled.
*/
+
+#ifndef CONFIG_X86_5LEVEL
+#define INIT_PGD_PAGE_TABLES 3
+#else
+#define INIT_PGD_PAGE_TABLES 4
+#endif
+
#ifndef CONFIG_RANDOMIZE_MEMORY
-#define INIT_PGD_PAGE_COUNT 6
+#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES)
#else
-#define INIT_PGD_PAGE_COUNT 12
+#define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES)
#endif
+
#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void)
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index c3d5f0236f35..4b01f7dbaf30 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -475,9 +475,10 @@ void __init mem_encrypt_init(void)
swiotlb_update_mem_attributes();
/*
- * With SEV, we need to unroll the rep string I/O instructions.
+ * With SEV, we need to unroll the rep string I/O instructions,
+ * but SEV-ES supports them through the #VC handler.
*/
- if (sev_active())
+ if (sev_active() && !sev_es_active())
static_branch_enable(&sev_enable_key);
print_mem_encrypt_feature_info();
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index bd7aff5c51f7..cd768dafca9e 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -10,8 +10,6 @@
#define pr_fmt(fmt) "mmiotrace: " fmt
-#define DEBUG 1
-
#include <linux/moduleparam.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 8f665c352bf0..ca311aaa67b8 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -1164,12 +1164,14 @@ static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ kfree(v);
++*pos;
return memtype_get_idx(*pos);
}
static void memtype_seq_stop(struct seq_file *seq, void *v)
{
+ kfree(v);
}
static int memtype_seq_show(struct seq_file *seq, void *v)
@@ -1181,8 +1183,6 @@ static int memtype_seq_show(struct seq_file *seq, void *v)
entry_print->end,
cattr_name(entry_print->type));
- kfree(entry_print);
-
return 0;
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 796506dcfc42..6926d0ca6c71 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -205,6 +205,18 @@ static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
}
+/* Some 1-byte opcodes for binary ALU operations */
+static u8 simple_alu_opcodes[] = {
+ [BPF_ADD] = 0x01,
+ [BPF_SUB] = 0x29,
+ [BPF_AND] = 0x21,
+ [BPF_OR] = 0x09,
+ [BPF_XOR] = 0x31,
+ [BPF_LSH] = 0xE0,
+ [BPF_RSH] = 0xE8,
+ [BPF_ARSH] = 0xF8,
+};
+
static void jit_fill_hole(void *area, unsigned int size)
{
/* Fill whole space with INT3 instructions */
@@ -681,6 +693,42 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
*pprog = prog;
}
+/* Emit the suffix (ModR/M etc) for addressing *(ptr_reg + off) and val_reg */
+static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (is_imm8(off)) {
+ /* 1-byte signed displacement.
+ *
+ * If off == 0 we could skip this and save one extra byte, but
+ * special case of x86 R13 which always needs an offset is not
+ * worth the hassle
+ */
+ EMIT2(add_2reg(0x40, ptr_reg, val_reg), off);
+ } else {
+ /* 4-byte signed displacement */
+ EMIT1_off32(add_2reg(0x80, ptr_reg, val_reg), off);
+ }
+ *pprog = prog;
+}
+
+/*
+ * Emit a REX byte if it will be necessary to address these registers
+ */
+static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (is64)
+ EMIT1(add_2mod(0x48, dst_reg, src_reg));
+ else if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ *pprog = prog;
+}
+
/* LDX: dst_reg = *(u8*)(src_reg + off) */
static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
@@ -708,15 +756,7 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
break;
}
- /*
- * If insn->off == 0 we can save one extra byte, but
- * special case of x86 R13 which always needs an offset
- * is not worth the hassle
- */
- if (is_imm8(off))
- EMIT2(add_2reg(0x40, src_reg, dst_reg), off);
- else
- EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off);
+ emit_insn_suffix(&prog, src_reg, dst_reg, off);
*pprog = prog;
}
@@ -751,11 +791,51 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
break;
}
- if (is_imm8(off))
- EMIT2(add_2reg(0x40, dst_reg, src_reg), off);
- else
- EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off);
+ emit_insn_suffix(&prog, dst_reg, src_reg, off);
+ *pprog = prog;
+}
+
+static int emit_atomic(u8 **pprog, u8 atomic_op,
+ u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ EMIT1(0xF0); /* lock prefix */
+
+ maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW);
+
+ /* emit opcode */
+ switch (atomic_op) {
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_AND:
+ case BPF_OR:
+ case BPF_XOR:
+ /* lock *(u32/u64*)(dst_reg + off) <op>= src_reg */
+ EMIT1(simple_alu_opcodes[atomic_op]);
+ break;
+ case BPF_ADD | BPF_FETCH:
+ /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */
+ EMIT2(0x0F, 0xC1);
+ break;
+ case BPF_XCHG:
+ /* src_reg = atomic_xchg(dst_reg + off, src_reg); */
+ EMIT1(0x87);
+ break;
+ case BPF_CMPXCHG:
+ /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */
+ EMIT2(0x0F, 0xB1);
+ break;
+ default:
+ pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
+ return -EFAULT;
+ }
+
+ emit_insn_suffix(&prog, dst_reg, src_reg, off);
+
*pprog = prog;
+ return 0;
}
static bool ex_handler_bpf(const struct exception_table_entry *x,
@@ -789,8 +869,31 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
}
}
+static int emit_nops(u8 **pprog, int len)
+{
+ u8 *prog = *pprog;
+ int i, noplen, cnt = 0;
+
+ while (len > 0) {
+ noplen = len;
+
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+
+ for (i = 0; i < noplen; i++)
+ EMIT1(ideal_nops[noplen][i]);
+ len -= noplen;
+ }
+
+ *pprog = prog;
+
+ return cnt;
+}
+
+#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
- int oldproglen, struct jit_context *ctx)
+ int oldproglen, struct jit_context *ctx, bool jmp_padding)
{
bool tail_call_reachable = bpf_prog->aux->tail_call_reachable;
struct bpf_insn *insn = bpf_prog->insnsi;
@@ -800,8 +903,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
int i, cnt = 0, excnt = 0;
- int proglen = 0;
+ int ilen, proglen = 0;
u8 *prog = temp;
+ int err;
detect_reg_usage(insn, insn_cnt, callee_regs_used,
&tail_call_seen);
@@ -813,17 +917,24 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
bpf_prog->aux->func_idx != 0);
push_callee_regs(&prog, callee_regs_used);
- addrs[0] = prog - temp;
+
+ ilen = prog - temp;
+ if (image)
+ memcpy(image + proglen, temp, ilen);
+ proglen += ilen;
+ addrs[0] = proglen;
+ prog = temp;
for (i = 1; i <= insn_cnt; i++, insn++) {
const s32 imm32 = insn->imm;
u32 dst_reg = insn->dst_reg;
u32 src_reg = insn->src_reg;
u8 b2 = 0, b3 = 0;
+ u8 *start_of_ldx;
s64 jmp_offset;
u8 jmp_cond;
- int ilen;
u8 *func;
+ int nops;
switch (insn->code) {
/* ALU */
@@ -837,17 +948,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_ALU64 | BPF_AND | BPF_X:
case BPF_ALU64 | BPF_OR | BPF_X:
case BPF_ALU64 | BPF_XOR | BPF_X:
- switch (BPF_OP(insn->code)) {
- case BPF_ADD: b2 = 0x01; break;
- case BPF_SUB: b2 = 0x29; break;
- case BPF_AND: b2 = 0x21; break;
- case BPF_OR: b2 = 0x09; break;
- case BPF_XOR: b2 = 0x31; break;
- }
- if (BPF_CLASS(insn->code) == BPF_ALU64)
- EMIT1(add_2mod(0x48, dst_reg, src_reg));
- else if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ maybe_emit_mod(&prog, dst_reg, src_reg,
+ BPF_CLASS(insn->code) == BPF_ALU64);
+ b2 = simple_alu_opcodes[BPF_OP(insn->code)];
EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
break;
@@ -1027,12 +1130,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
else if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
- switch (BPF_OP(insn->code)) {
- case BPF_LSH: b3 = 0xE0; break;
- case BPF_RSH: b3 = 0xE8; break;
- case BPF_ARSH: b3 = 0xF8; break;
- }
-
+ b3 = simple_alu_opcodes[BPF_OP(insn->code)];
if (imm32 == 1)
EMIT2(0xD1, add_1reg(b3, dst_reg));
else
@@ -1066,11 +1164,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
else if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
- switch (BPF_OP(insn->code)) {
- case BPF_LSH: b3 = 0xE0; break;
- case BPF_RSH: b3 = 0xE8; break;
- case BPF_ARSH: b3 = 0xF8; break;
- }
+ b3 = simple_alu_opcodes[BPF_OP(insn->code)];
EMIT2(0xD3, add_1reg(b3, dst_reg));
if (src_reg != BPF_REG_4)
@@ -1185,12 +1279,30 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
+ if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
+ /* test src_reg, src_reg */
+ maybe_emit_mod(&prog, src_reg, src_reg, true); /* always 1 byte */
+ EMIT2(0x85, add_2reg(0xC0, src_reg, src_reg));
+ /* jne start_of_ldx */
+ EMIT2(X86_JNE, 0);
+ /* xor dst_reg, dst_reg */
+ emit_mov_imm32(&prog, false, dst_reg, 0);
+ /* jmp byte_after_ldx */
+ EMIT2(0xEB, 0);
+
+ /* populate jmp_offset for JNE above */
+ temp[4] = prog - temp - 5 /* sizeof(test + jne) */;
+ start_of_ldx = prog;
+ }
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
struct exception_table_entry *ex;
u8 *_insn = image + proglen;
s64 delta;
+ /* populate jmp_offset for JMP above */
+ start_of_ldx[-1] = prog - start_of_ldx;
+
if (!bpf_prog->aux->extable)
break;
@@ -1230,21 +1342,60 @@ st: if (is_imm8(insn->off))
}
break;
- /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */
- case BPF_STX | BPF_XADD | BPF_W:
- /* Emit 'lock add dword ptr [rax + off], eax' */
- if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01);
- else
- EMIT2(0xF0, 0x01);
- goto xadd;
- case BPF_STX | BPF_XADD | BPF_DW:
- EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01);
-xadd: if (is_imm8(insn->off))
- EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
- else
- EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
- insn->off);
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
+ if (insn->imm == (BPF_AND | BPF_FETCH) ||
+ insn->imm == (BPF_OR | BPF_FETCH) ||
+ insn->imm == (BPF_XOR | BPF_FETCH)) {
+ u8 *branch_target;
+ bool is64 = BPF_SIZE(insn->code) == BPF_DW;
+ u32 real_src_reg = src_reg;
+
+ /*
+ * Can't be implemented with a single x86 insn.
+ * Need to do a CMPXCHG loop.
+ */
+
+ /* Will need RAX as a CMPXCHG operand so save R0 */
+ emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0);
+ if (src_reg == BPF_REG_0)
+ real_src_reg = BPF_REG_AX;
+
+ branch_target = prog;
+ /* Load old value */
+ emit_ldx(&prog, BPF_SIZE(insn->code),
+ BPF_REG_0, dst_reg, insn->off);
+ /*
+ * Perform the (commutative) operation locally,
+ * put the result in the AUX_REG.
+ */
+ emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0);
+ maybe_emit_mod(&prog, AUX_REG, real_src_reg, is64);
+ EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)],
+ add_2reg(0xC0, AUX_REG, real_src_reg));
+ /* Attempt to swap in new value */
+ err = emit_atomic(&prog, BPF_CMPXCHG,
+ dst_reg, AUX_REG, insn->off,
+ BPF_SIZE(insn->code));
+ if (WARN_ON(err))
+ return err;
+ /*
+ * ZF tells us whether we won the race. If it's
+ * cleared we need to try again.
+ */
+ EMIT2(X86_JNE, -(prog - branch_target) - 2);
+ /* Return the pre-modification value */
+ emit_mov_reg(&prog, is64, real_src_reg, BPF_REG_0);
+ /* Restore R0 after clobbering RAX */
+ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX);
+ break;
+
+ }
+
+ err = emit_atomic(&prog, insn->imm, dst_reg, src_reg,
+ insn->off, BPF_SIZE(insn->code));
+ if (err)
+ return err;
break;
/* call */
@@ -1295,20 +1446,16 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP32 | BPF_JSGE | BPF_X:
case BPF_JMP32 | BPF_JSLE | BPF_X:
/* cmp dst_reg, src_reg */
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_2mod(0x48, dst_reg, src_reg));
- else if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ maybe_emit_mod(&prog, dst_reg, src_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg));
goto emit_cond_jmp;
case BPF_JMP | BPF_JSET | BPF_X:
case BPF_JMP32 | BPF_JSET | BPF_X:
/* test dst_reg, src_reg */
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_2mod(0x48, dst_reg, src_reg));
- else if (is_ereg(dst_reg) || is_ereg(src_reg))
- EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ maybe_emit_mod(&prog, dst_reg, src_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg));
goto emit_cond_jmp;
@@ -1344,10 +1491,8 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP32 | BPF_JSLE | BPF_K:
/* test dst_reg, dst_reg to save one extra byte */
if (imm32 == 0) {
- if (BPF_CLASS(insn->code) == BPF_JMP)
- EMIT1(add_2mod(0x48, dst_reg, dst_reg));
- else if (is_ereg(dst_reg))
- EMIT1(add_2mod(0x40, dst_reg, dst_reg));
+ maybe_emit_mod(&prog, dst_reg, dst_reg,
+ BPF_CLASS(insn->code) == BPF_JMP);
EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
goto emit_cond_jmp;
}
@@ -1409,6 +1554,30 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
}
jmp_offset = addrs[i + insn->off] - addrs[i];
if (is_imm8(jmp_offset)) {
+ if (jmp_padding) {
+ /* To keep the jmp_offset valid, the extra bytes are
+ * padded before the jump insn, so we substract the
+ * 2 bytes of jmp_cond insn from INSN_SZ_DIFF.
+ *
+ * If the previous pass already emits an imm8
+ * jmp_cond, then this BPF insn won't shrink, so
+ * "nops" is 0.
+ *
+ * On the other hand, if the previous pass emits an
+ * imm32 jmp_cond, the extra 4 bytes(*) is padded to
+ * keep the image from shrinking further.
+ *
+ * (*) imm32 jmp_cond is 6 bytes, and imm8 jmp_cond
+ * is 2 bytes, so the size difference is 4 bytes.
+ */
+ nops = INSN_SZ_DIFF - 2;
+ if (nops != 0 && nops != 4) {
+ pr_err("unexpected jmp_cond padding: %d bytes\n",
+ nops);
+ return -EFAULT;
+ }
+ cnt += emit_nops(&prog, nops);
+ }
EMIT2(jmp_cond, jmp_offset);
} else if (is_simm32(jmp_offset)) {
EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
@@ -1431,11 +1600,55 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
else
jmp_offset = addrs[i + insn->off] - addrs[i];
- if (!jmp_offset)
- /* Optimize out nop jumps */
+ if (!jmp_offset) {
+ /*
+ * If jmp_padding is enabled, the extra nops will
+ * be inserted. Otherwise, optimize out nop jumps.
+ */
+ if (jmp_padding) {
+ /* There are 3 possible conditions.
+ * (1) This BPF_JA is already optimized out in
+ * the previous run, so there is no need
+ * to pad any extra byte (0 byte).
+ * (2) The previous pass emits an imm8 jmp,
+ * so we pad 2 bytes to match the previous
+ * insn size.
+ * (3) Similarly, the previous pass emits an
+ * imm32 jmp, and 5 bytes is padded.
+ */
+ nops = INSN_SZ_DIFF;
+ if (nops != 0 && nops != 2 && nops != 5) {
+ pr_err("unexpected nop jump padding: %d bytes\n",
+ nops);
+ return -EFAULT;
+ }
+ cnt += emit_nops(&prog, nops);
+ }
break;
+ }
emit_jmp:
if (is_imm8(jmp_offset)) {
+ if (jmp_padding) {
+ /* To avoid breaking jmp_offset, the extra bytes
+ * are padded before the actual jmp insn, so
+ * 2 bytes is substracted from INSN_SZ_DIFF.
+ *
+ * If the previous pass already emits an imm8
+ * jmp, there is nothing to pad (0 byte).
+ *
+ * If it emits an imm32 jmp (5 bytes) previously
+ * and now an imm8 jmp (2 bytes), then we pad
+ * (5 - 2 = 3) bytes to stop the image from
+ * shrinking further.
+ */
+ nops = INSN_SZ_DIFF - 2;
+ if (nops != 0 && nops != 3) {
+ pr_err("unexpected jump padding: %d bytes\n",
+ nops);
+ return -EFAULT;
+ }
+ cnt += emit_nops(&prog, INSN_SZ_DIFF - 2);
+ }
EMIT2(0xEB, jmp_offset);
} else if (is_simm32(jmp_offset)) {
EMIT1_off32(0xE9, jmp_offset);
@@ -1531,17 +1744,25 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
struct bpf_prog *p, int stack_size, bool mod_ret)
{
u8 *prog = *pprog;
+ u8 *jmp_insn;
int cnt = 0;
- if (p->aux->sleepable) {
- if (emit_call(&prog, __bpf_prog_enter_sleepable, prog))
+ /* arg1: mov rdi, progs[i] */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
+ if (emit_call(&prog,
+ p->aux->sleepable ? __bpf_prog_enter_sleepable :
+ __bpf_prog_enter, prog))
return -EINVAL;
- } else {
- if (emit_call(&prog, __bpf_prog_enter, prog))
- return -EINVAL;
- /* remember prog start time returned by __bpf_prog_enter */
- emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
- }
+ /* remember prog start time returned by __bpf_prog_enter */
+ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
+
+ /* if (__bpf_prog_enter*(prog) == 0)
+ * goto skip_exec_of_prog;
+ */
+ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */
+ /* emit 2 nops that will be replaced with JE insn */
+ jmp_insn = prog;
+ emit_nops(&prog, 2);
/* arg1: lea rdi, [rbp - stack_size] */
EMIT4(0x48, 0x8D, 0x7D, -stack_size);
@@ -1561,43 +1782,23 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
if (mod_ret)
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
- if (p->aux->sleepable) {
- if (emit_call(&prog, __bpf_prog_exit_sleepable, prog))
- return -EINVAL;
- } else {
- /* arg1: mov rdi, progs[i] */
- emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32,
- (u32) (long) p);
- /* arg2: mov rsi, rbx <- start time in nsec */
- emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
- if (emit_call(&prog, __bpf_prog_exit, prog))
+ /* replace 2 nops with JE insn, since jmp target is known */
+ jmp_insn[0] = X86_JE;
+ jmp_insn[1] = prog - jmp_insn - 2;
+
+ /* arg1: mov rdi, progs[i] */
+ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
+ /* arg2: mov rsi, rbx <- start time in nsec */
+ emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
+ if (emit_call(&prog,
+ p->aux->sleepable ? __bpf_prog_exit_sleepable :
+ __bpf_prog_exit, prog))
return -EINVAL;
- }
*pprog = prog;
return 0;
}
-static void emit_nops(u8 **pprog, unsigned int len)
-{
- unsigned int i, noplen;
- u8 *prog = *pprog;
- int cnt = 0;
-
- while (len > 0) {
- noplen = len;
-
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
-
- for (i = 0; i < noplen; i++)
- EMIT1(ideal_nops[noplen][i]);
- len -= noplen;
- }
-
- *pprog = prog;
-}
-
static void emit_align(u8 **pprog, u32 align)
{
u8 *target, *prog = *pprog;
@@ -1972,6 +2173,9 @@ struct x64_jit_data {
struct jit_context ctx;
};
+#define MAX_PASSES 20
+#define PADDING_PASSES (MAX_PASSES - 5)
+
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *header = NULL;
@@ -1981,6 +2185,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
struct jit_context ctx = {};
bool tmp_blinded = false;
bool extra_pass = false;
+ bool padding = false;
u8 *image = NULL;
int *addrs;
int pass;
@@ -2017,6 +2222,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
image = jit_data->image;
header = jit_data->header;
extra_pass = true;
+ padding = true;
goto skip_init_addrs;
}
addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
@@ -2042,8 +2248,10 @@ skip_init_addrs:
* may converge on the last pass. In such case do one more
* pass to emit the final image.
*/
- for (pass = 0; pass < 20 || image; pass++) {
- proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+ for (pass = 0; pass < MAX_PASSES || image; pass++) {
+ if (!padding && pass >= PADDING_PASSES)
+ padding = true;
+ proglen = do_jit(prog, addrs, image, oldproglen, &ctx, padding);
if (proglen <= 0) {
out_image:
image = NULL;
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 96fde03aa987..d17b67c69f89 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2243,10 +2243,8 @@ emit_jmp:
return -EFAULT;
}
break;
- /* STX XADD: lock *(u32 *)(dst + off) += src */
- case BPF_STX | BPF_XADD | BPF_W:
- /* STX XADD: lock *(u64 *)(dst + off) += src */
- case BPF_STX | BPF_XADD | BPF_DW:
+ case BPF_STX | BPF_ATOMIC | BPF_W:
+ case BPF_STX | BPF_ATOMIC | BPF_DW:
goto notyet;
case BPF_JMP | BPF_EXIT:
if (seen_exit) {
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
deleted file mode 100644
index 4d49b5a27025..000000000000
--- a/arch/x86/oprofile/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_OPROFILE) += oprofile.o
-
-DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
- oprof.o cpu_buffer.o buffer_sync.o \
- event_buffer.o oprofile_files.o \
- oprofilefs.o oprofile_stats.o \
- timer_int.o nmi_timer_int.o )
-
-oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
-oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
- op_model_ppro.o op_model_p4.o
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
deleted file mode 100644
index 1d8391fcca68..000000000000
--- a/arch/x86/oprofile/backtrace.c
+++ /dev/null
@@ -1,127 +0,0 @@
-/**
- * @file backtrace.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- * @author David Smith
- */
-
-#include <linux/oprofile.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/compat.h>
-#include <linux/uaccess.h>
-
-#include <asm/ptrace.h>
-#include <asm/stacktrace.h>
-#include <asm/unwind.h>
-
-#ifdef CONFIG_COMPAT
-static struct stack_frame_ia32 *
-dump_user_backtrace_32(struct stack_frame_ia32 *head)
-{
- /* Also check accessibility of one struct frame_head beyond: */
- struct stack_frame_ia32 bufhead[2];
- struct stack_frame_ia32 *fp;
- unsigned long bytes;
-
- bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
- if (bytes != 0)
- return NULL;
-
- fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
-
- oprofile_add_trace(bufhead[0].return_address);
-
- /* frame pointers should strictly progress back up the stack
- * (towards higher addresses) */
- if (head >= fp)
- return NULL;
-
- return fp;
-}
-
-static inline int
-x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
-{
- struct stack_frame_ia32 *head;
-
- /* User process is IA32 */
- if (!current || user_64bit_mode(regs))
- return 0;
-
- head = (struct stack_frame_ia32 *) regs->bp;
- while (depth-- && head)
- head = dump_user_backtrace_32(head);
-
- return 1;
-}
-
-#else
-static inline int
-x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
-{
- return 0;
-}
-#endif /* CONFIG_COMPAT */
-
-static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
-{
- /* Also check accessibility of one struct frame_head beyond: */
- struct stack_frame bufhead[2];
- unsigned long bytes;
-
- bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
- if (bytes != 0)
- return NULL;
-
- oprofile_add_trace(bufhead[0].return_address);
-
- /* frame pointers should strictly progress back up the stack
- * (towards higher addresses) */
- if (head >= bufhead[0].next_frame)
- return NULL;
-
- return bufhead[0].next_frame;
-}
-
-void
-x86_backtrace(struct pt_regs * const regs, unsigned int depth)
-{
- struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
-
- if (!user_mode(regs)) {
- struct unwind_state state;
- unsigned long addr;
-
- if (!depth)
- return;
-
- oprofile_add_trace(regs->ip);
-
- if (!--depth)
- return;
-
- for (unwind_start(&state, current, regs, NULL);
- !unwind_done(&state); unwind_next_frame(&state)) {
- addr = unwind_get_return_address(&state);
- if (!addr)
- break;
-
- oprofile_add_trace(addr);
-
- if (!--depth)
- break;
- }
-
- return;
- }
-
- if (x86_backtrace_32(regs, depth))
- return;
-
- while (depth-- && head)
- head = dump_user_backtrace(head);
-}
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
deleted file mode 100644
index 9e138d00ad36..000000000000
--- a/arch/x86/oprofile/init.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/**
- * @file init.c
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- */
-
-#include <linux/oprofile.h>
-#include <linux/init.h>
-#include <linux/errno.h>
-
-/*
- * We support CPUs that have performance counters like the Pentium Pro
- * with the NMI mode driver.
- */
-
-#ifdef CONFIG_X86_LOCAL_APIC
-extern int op_nmi_init(struct oprofile_operations *ops);
-extern void op_nmi_exit(void);
-#else
-static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; }
-static void op_nmi_exit(void) { }
-#endif
-
-extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
-
-int __init oprofile_arch_init(struct oprofile_operations *ops)
-{
- ops->backtrace = x86_backtrace;
- return op_nmi_init(ops);
-}
-
-void oprofile_arch_exit(void)
-{
- op_nmi_exit();
-}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
deleted file mode 100644
index a7a7677265b6..000000000000
--- a/arch/x86/oprofile/nmi_int.c
+++ /dev/null
@@ -1,780 +0,0 @@
-/**
- * @file nmi_int.c
- *
- * @remark Copyright 2002-2009 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon <levon@movementarian.org>
- * @author Robert Richter <robert.richter@amd.com>
- * @author Barry Kasindorf <barry.kasindorf@amd.com>
- * @author Jason Yeh <jason.yeh@amd.com>
- * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#include <linux/init.h>
-#include <linux/notifier.h>
-#include <linux/smp.h>
-#include <linux/oprofile.h>
-#include <linux/syscore_ops.h>
-#include <linux/slab.h>
-#include <linux/moduleparam.h>
-#include <linux/kdebug.h>
-#include <linux/cpu.h>
-#include <asm/nmi.h>
-#include <asm/msr.h>
-#include <asm/apic.h>
-
-#include "op_counter.h"
-#include "op_x86_model.h"
-
-static struct op_x86_model_spec *model;
-static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
-static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
-
-/* must be protected with get_online_cpus()/put_online_cpus(): */
-static int nmi_enabled;
-static int ctr_running;
-
-struct op_counter_config counter_config[OP_MAX_COUNTER];
-
-/* common functions */
-
-u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
- struct op_counter_config *counter_config)
-{
- u64 val = 0;
- u16 event = (u16)counter_config->event;
-
- val |= ARCH_PERFMON_EVENTSEL_INT;
- val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
- val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
- val |= (counter_config->unit_mask & 0xFF) << 8;
- counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV |
- ARCH_PERFMON_EVENTSEL_EDGE |
- ARCH_PERFMON_EVENTSEL_CMASK);
- val |= counter_config->extra;
- event &= model->event_mask ? model->event_mask : 0xFF;
- val |= event & 0xFF;
- val |= (u64)(event & 0x0F00) << 24;
-
- return val;
-}
-
-
-static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs)
-{
- if (ctr_running)
- model->check_ctrs(regs, this_cpu_ptr(&cpu_msrs));
- else if (!nmi_enabled)
- return NMI_DONE;
- else
- model->stop(this_cpu_ptr(&cpu_msrs));
- return NMI_HANDLED;
-}
-
-static void nmi_cpu_save_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *controls = msrs->controls;
- unsigned int i;
-
- for (i = 0; i < model->num_counters; ++i) {
- if (counters[i].addr)
- rdmsrl(counters[i].addr, counters[i].saved);
- }
-
- for (i = 0; i < model->num_controls; ++i) {
- if (controls[i].addr)
- rdmsrl(controls[i].addr, controls[i].saved);
- }
-}
-
-static void nmi_cpu_start(void *dummy)
-{
- struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs);
- if (!msrs->controls)
- WARN_ON_ONCE(1);
- else
- model->start(msrs);
-}
-
-static int nmi_start(void)
-{
- get_online_cpus();
- ctr_running = 1;
- /* make ctr_running visible to the nmi handler: */
- smp_mb();
- on_each_cpu(nmi_cpu_start, NULL, 1);
- put_online_cpus();
- return 0;
-}
-
-static void nmi_cpu_stop(void *dummy)
-{
- struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs);
- if (!msrs->controls)
- WARN_ON_ONCE(1);
- else
- model->stop(msrs);
-}
-
-static void nmi_stop(void)
-{
- get_online_cpus();
- on_each_cpu(nmi_cpu_stop, NULL, 1);
- ctr_running = 0;
- put_online_cpus();
-}
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static DEFINE_PER_CPU(int, switch_index);
-
-static inline int has_mux(void)
-{
- return !!model->switch_ctrl;
-}
-
-inline int op_x86_phys_to_virt(int phys)
-{
- return __this_cpu_read(switch_index) + phys;
-}
-
-inline int op_x86_virt_to_phys(int virt)
-{
- return virt % model->num_counters;
-}
-
-static void nmi_shutdown_mux(void)
-{
- int i;
-
- if (!has_mux())
- return;
-
- for_each_possible_cpu(i) {
- kfree(per_cpu(cpu_msrs, i).multiplex);
- per_cpu(cpu_msrs, i).multiplex = NULL;
- per_cpu(switch_index, i) = 0;
- }
-}
-
-static int nmi_setup_mux(void)
-{
- size_t multiplex_size =
- sizeof(struct op_msr) * model->num_virt_counters;
- int i;
-
- if (!has_mux())
- return 1;
-
- for_each_possible_cpu(i) {
- per_cpu(cpu_msrs, i).multiplex =
- kzalloc(multiplex_size, GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).multiplex)
- return 0;
- }
-
- return 1;
-}
-
-static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
-{
- int i;
- struct op_msr *multiplex = msrs->multiplex;
-
- if (!has_mux())
- return;
-
- for (i = 0; i < model->num_virt_counters; ++i) {
- if (counter_config[i].enabled) {
- multiplex[i].saved = -(u64)counter_config[i].count;
- } else {
- multiplex[i].saved = 0;
- }
- }
-
- per_cpu(switch_index, cpu) = 0;
-}
-
-static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *multiplex = msrs->multiplex;
- int i;
-
- for (i = 0; i < model->num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (counters[i].addr)
- rdmsrl(counters[i].addr, multiplex[virt].saved);
- }
-}
-
-static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *multiplex = msrs->multiplex;
- int i;
-
- for (i = 0; i < model->num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (counters[i].addr)
- wrmsrl(counters[i].addr, multiplex[virt].saved);
- }
-}
-
-static void nmi_cpu_switch(void *dummy)
-{
- int cpu = smp_processor_id();
- int si = per_cpu(switch_index, cpu);
- struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
-
- nmi_cpu_stop(NULL);
- nmi_cpu_save_mpx_registers(msrs);
-
- /* move to next set */
- si += model->num_counters;
- if ((si >= model->num_virt_counters) || (counter_config[si].count == 0))
- per_cpu(switch_index, cpu) = 0;
- else
- per_cpu(switch_index, cpu) = si;
-
- model->switch_ctrl(model, msrs);
- nmi_cpu_restore_mpx_registers(msrs);
-
- nmi_cpu_start(NULL);
-}
-
-
-/*
- * Quick check to see if multiplexing is necessary.
- * The check should be sufficient since counters are used
- * in ordre.
- */
-static int nmi_multiplex_on(void)
-{
- return counter_config[model->num_counters].count ? 0 : -EINVAL;
-}
-
-static int nmi_switch_event(void)
-{
- if (!has_mux())
- return -ENOSYS; /* not implemented */
- if (nmi_multiplex_on() < 0)
- return -EINVAL; /* not necessary */
-
- get_online_cpus();
- if (ctr_running)
- on_each_cpu(nmi_cpu_switch, NULL, 1);
- put_online_cpus();
-
- return 0;
-}
-
-static inline void mux_init(struct oprofile_operations *ops)
-{
- if (has_mux())
- ops->switch_events = nmi_switch_event;
-}
-
-static void mux_clone(int cpu)
-{
- if (!has_mux())
- return;
-
- memcpy(per_cpu(cpu_msrs, cpu).multiplex,
- per_cpu(cpu_msrs, 0).multiplex,
- sizeof(struct op_msr) * model->num_virt_counters);
-}
-
-#else
-
-inline int op_x86_phys_to_virt(int phys) { return phys; }
-inline int op_x86_virt_to_phys(int virt) { return virt; }
-static inline void nmi_shutdown_mux(void) { }
-static inline int nmi_setup_mux(void) { return 1; }
-static inline void
-nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { }
-static inline void mux_init(struct oprofile_operations *ops) { }
-static void mux_clone(int cpu) { }
-
-#endif
-
-static void free_msrs(void)
-{
- int i;
- for_each_possible_cpu(i) {
- kfree(per_cpu(cpu_msrs, i).counters);
- per_cpu(cpu_msrs, i).counters = NULL;
- kfree(per_cpu(cpu_msrs, i).controls);
- per_cpu(cpu_msrs, i).controls = NULL;
- }
- nmi_shutdown_mux();
-}
-
-static int allocate_msrs(void)
-{
- size_t controls_size = sizeof(struct op_msr) * model->num_controls;
- size_t counters_size = sizeof(struct op_msr) * model->num_counters;
-
- int i;
- for_each_possible_cpu(i) {
- per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
- GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).counters)
- goto fail;
- per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
- GFP_KERNEL);
- if (!per_cpu(cpu_msrs, i).controls)
- goto fail;
- }
-
- if (!nmi_setup_mux())
- goto fail;
-
- return 1;
-
-fail:
- free_msrs();
- return 0;
-}
-
-static void nmi_cpu_setup(void)
-{
- int cpu = smp_processor_id();
- struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
-
- nmi_cpu_save_registers(msrs);
- raw_spin_lock(&oprofilefs_lock);
- model->setup_ctrs(model, msrs);
- nmi_cpu_setup_mux(cpu, msrs);
- raw_spin_unlock(&oprofilefs_lock);
- per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-}
-
-static void nmi_cpu_restore_registers(struct op_msrs *msrs)
-{
- struct op_msr *counters = msrs->counters;
- struct op_msr *controls = msrs->controls;
- unsigned int i;
-
- for (i = 0; i < model->num_controls; ++i) {
- if (controls[i].addr)
- wrmsrl(controls[i].addr, controls[i].saved);
- }
-
- for (i = 0; i < model->num_counters; ++i) {
- if (counters[i].addr)
- wrmsrl(counters[i].addr, counters[i].saved);
- }
-}
-
-static void nmi_cpu_shutdown(void)
-{
- unsigned int v;
- int cpu = smp_processor_id();
- struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
-
- /* restoring APIC_LVTPC can trigger an apic error because the delivery
- * mode and vector nr combination can be illegal. That's by design: on
- * power on apic lvt contain a zero vector nr which are legal only for
- * NMI delivery mode. So inhibit apic err before restoring lvtpc
- */
- v = apic_read(APIC_LVTERR);
- apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
- apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
- apic_write(APIC_LVTERR, v);
- nmi_cpu_restore_registers(msrs);
-}
-
-static int nmi_cpu_online(unsigned int cpu)
-{
- local_irq_disable();
- if (nmi_enabled)
- nmi_cpu_setup();
- if (ctr_running)
- nmi_cpu_start(NULL);
- local_irq_enable();
- return 0;
-}
-
-static int nmi_cpu_down_prep(unsigned int cpu)
-{
- local_irq_disable();
- if (ctr_running)
- nmi_cpu_stop(NULL);
- if (nmi_enabled)
- nmi_cpu_shutdown();
- local_irq_enable();
- return 0;
-}
-
-static int nmi_create_files(struct dentry *root)
-{
- unsigned int i;
-
- for (i = 0; i < model->num_virt_counters; ++i) {
- struct dentry *dir;
- char buf[4];
-
- /* quick little hack to _not_ expose a counter if it is not
- * available for use. This should protect userspace app.
- * NOTE: assumes 1:1 mapping here (that counters are organized
- * sequentially in their struct assignment).
- */
- if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i)))
- continue;
-
- snprintf(buf, sizeof(buf), "%d", i);
- dir = oprofilefs_mkdir(root, buf);
- oprofilefs_create_ulong(dir, "enabled", &counter_config[i].enabled);
- oprofilefs_create_ulong(dir, "event", &counter_config[i].event);
- oprofilefs_create_ulong(dir, "count", &counter_config[i].count);
- oprofilefs_create_ulong(dir, "unit_mask", &counter_config[i].unit_mask);
- oprofilefs_create_ulong(dir, "kernel", &counter_config[i].kernel);
- oprofilefs_create_ulong(dir, "user", &counter_config[i].user);
- oprofilefs_create_ulong(dir, "extra", &counter_config[i].extra);
- }
-
- return 0;
-}
-
-static enum cpuhp_state cpuhp_nmi_online;
-
-static int nmi_setup(void)
-{
- int err = 0;
- int cpu;
-
- if (!allocate_msrs())
- return -ENOMEM;
-
- /* We need to serialize save and setup for HT because the subset
- * of msrs are distinct for save and setup operations
- */
-
- /* Assume saved/restored counters are the same on all CPUs */
- err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
- if (err)
- goto fail;
-
- for_each_possible_cpu(cpu) {
- if (!IS_ENABLED(CONFIG_SMP) || !cpu)
- continue;
-
- memcpy(per_cpu(cpu_msrs, cpu).counters,
- per_cpu(cpu_msrs, 0).counters,
- sizeof(struct op_msr) * model->num_counters);
-
- memcpy(per_cpu(cpu_msrs, cpu).controls,
- per_cpu(cpu_msrs, 0).controls,
- sizeof(struct op_msr) * model->num_controls);
-
- mux_clone(cpu);
- }
-
- nmi_enabled = 0;
- ctr_running = 0;
- /* make variables visible to the nmi handler: */
- smp_mb();
- err = register_nmi_handler(NMI_LOCAL, profile_exceptions_notify,
- 0, "oprofile");
- if (err)
- goto fail;
-
- nmi_enabled = 1;
- /* make nmi_enabled visible to the nmi handler: */
- smp_mb();
- err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/oprofile:online",
- nmi_cpu_online, nmi_cpu_down_prep);
- if (err < 0)
- goto fail_nmi;
- cpuhp_nmi_online = err;
- return 0;
-fail_nmi:
- unregister_nmi_handler(NMI_LOCAL, "oprofile");
-fail:
- free_msrs();
- return err;
-}
-
-static void nmi_shutdown(void)
-{
- struct op_msrs *msrs;
-
- cpuhp_remove_state(cpuhp_nmi_online);
- nmi_enabled = 0;
- ctr_running = 0;
-
- /* make variables visible to the nmi handler: */
- smp_mb();
- unregister_nmi_handler(NMI_LOCAL, "oprofile");
- msrs = &get_cpu_var(cpu_msrs);
- model->shutdown(msrs);
- free_msrs();
- put_cpu_var(cpu_msrs);
-}
-
-#ifdef CONFIG_PM
-
-static int nmi_suspend(void)
-{
- /* Only one CPU left, just stop that one */
- if (nmi_enabled == 1)
- nmi_cpu_stop(NULL);
- return 0;
-}
-
-static void nmi_resume(void)
-{
- if (nmi_enabled == 1)
- nmi_cpu_start(NULL);
-}
-
-static struct syscore_ops oprofile_syscore_ops = {
- .resume = nmi_resume,
- .suspend = nmi_suspend,
-};
-
-static void __init init_suspend_resume(void)
-{
- register_syscore_ops(&oprofile_syscore_ops);
-}
-
-static void exit_suspend_resume(void)
-{
- unregister_syscore_ops(&oprofile_syscore_ops);
-}
-
-#else
-
-static inline void init_suspend_resume(void) { }
-static inline void exit_suspend_resume(void) { }
-
-#endif /* CONFIG_PM */
-
-static int __init p4_init(char **cpu_type)
-{
- __u8 cpu_model = boot_cpu_data.x86_model;
-
- if (cpu_model > 6 || cpu_model == 5)
- return 0;
-
-#ifndef CONFIG_SMP
- *cpu_type = "i386/p4";
- model = &op_p4_spec;
- return 1;
-#else
- switch (smp_num_siblings) {
- case 1:
- *cpu_type = "i386/p4";
- model = &op_p4_spec;
- return 1;
-
- case 2:
- *cpu_type = "i386/p4-ht";
- model = &op_p4_ht2_spec;
- return 1;
- }
-#endif
-
- printk(KERN_INFO "oprofile: P4 HyperThreading detected with > 2 threads\n");
- printk(KERN_INFO "oprofile: Reverting to timer mode.\n");
- return 0;
-}
-
-enum __force_cpu_type {
- reserved = 0, /* do not force */
- timer,
- arch_perfmon,
-};
-
-static int force_cpu_type;
-
-static int set_cpu_type(const char *str, const struct kernel_param *kp)
-{
- if (!strcmp(str, "timer")) {
- force_cpu_type = timer;
- printk(KERN_INFO "oprofile: forcing NMI timer mode\n");
- } else if (!strcmp(str, "arch_perfmon")) {
- force_cpu_type = arch_perfmon;
- printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
- } else {
- force_cpu_type = 0;
- }
-
- return 0;
-}
-module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
-
-static int __init ppro_init(char **cpu_type)
-{
- __u8 cpu_model = boot_cpu_data.x86_model;
- struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
-
- if (force_cpu_type == arch_perfmon && boot_cpu_has(X86_FEATURE_ARCH_PERFMON))
- return 0;
-
- /*
- * Documentation on identifying Intel processors by CPU family
- * and model can be found in the Intel Software Developer's
- * Manuals (SDM):
- *
- * http://www.intel.com/products/processor/manuals/
- *
- * As of May 2010 the documentation for this was in the:
- * "Intel 64 and IA-32 Architectures Software Developer's
- * Manual Volume 3B: System Programming Guide", "Table B-1
- * CPUID Signature Values of DisplayFamily_DisplayModel".
- */
- switch (cpu_model) {
- case 0 ... 2:
- *cpu_type = "i386/ppro";
- break;
- case 3 ... 5:
- *cpu_type = "i386/pii";
- break;
- case 6 ... 8:
- case 10 ... 11:
- *cpu_type = "i386/piii";
- break;
- case 9:
- case 13:
- *cpu_type = "i386/p6_mobile";
- break;
- case 14:
- *cpu_type = "i386/core";
- break;
- case 0x0f:
- case 0x16:
- case 0x17:
- case 0x1d:
- *cpu_type = "i386/core_2";
- break;
- case 0x1a:
- case 0x1e:
- case 0x2e:
- spec = &op_arch_perfmon_spec;
- *cpu_type = "i386/core_i7";
- break;
- case 0x1c:
- *cpu_type = "i386/atom";
- break;
- default:
- /* Unknown */
- return 0;
- }
-
- model = spec;
- return 1;
-}
-
-int __init op_nmi_init(struct oprofile_operations *ops)
-{
- __u8 vendor = boot_cpu_data.x86_vendor;
- __u8 family = boot_cpu_data.x86;
- char *cpu_type = NULL;
- int ret = 0;
-
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return -ENODEV;
-
- if (force_cpu_type == timer)
- return -ENODEV;
-
- switch (vendor) {
- case X86_VENDOR_AMD:
- /* Needs to be at least an Athlon (or hammer in 32bit mode) */
-
- switch (family) {
- case 6:
- cpu_type = "i386/athlon";
- break;
- case 0xf:
- /*
- * Actually it could be i386/hammer too, but
- * give user space an consistent name.
- */
- cpu_type = "x86-64/hammer";
- break;
- case 0x10:
- cpu_type = "x86-64/family10";
- break;
- case 0x11:
- cpu_type = "x86-64/family11h";
- break;
- case 0x12:
- cpu_type = "x86-64/family12h";
- break;
- case 0x14:
- cpu_type = "x86-64/family14h";
- break;
- case 0x15:
- cpu_type = "x86-64/family15h";
- break;
- default:
- return -ENODEV;
- }
- model = &op_amd_spec;
- break;
-
- case X86_VENDOR_INTEL:
- switch (family) {
- /* Pentium IV */
- case 0xf:
- p4_init(&cpu_type);
- break;
-
- /* A P6-class processor */
- case 6:
- ppro_init(&cpu_type);
- break;
-
- default:
- break;
- }
-
- if (cpu_type)
- break;
-
- if (!boot_cpu_has(X86_FEATURE_ARCH_PERFMON))
- return -ENODEV;
-
- /* use arch perfmon as fallback */
- cpu_type = "i386/arch_perfmon";
- model = &op_arch_perfmon_spec;
- break;
-
- default:
- return -ENODEV;
- }
-
- /* default values, can be overwritten by model */
- ops->create_files = nmi_create_files;
- ops->setup = nmi_setup;
- ops->shutdown = nmi_shutdown;
- ops->start = nmi_start;
- ops->stop = nmi_stop;
- ops->cpu_type = cpu_type;
-
- if (model->init)
- ret = model->init(ops);
- if (ret)
- return ret;
-
- if (!model->num_virt_counters)
- model->num_virt_counters = model->num_counters;
-
- mux_init(ops);
-
- init_suspend_resume();
-
- printk(KERN_INFO "oprofile: using NMI interrupt.\n");
- return 0;
-}
-
-void op_nmi_exit(void)
-{
- exit_suspend_resume();
-}
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
deleted file mode 100644
index 0b7b7b179cbe..000000000000
--- a/arch/x86/oprofile/op_counter.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * @file op_counter.h
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- */
-
-#ifndef OP_COUNTER_H
-#define OP_COUNTER_H
-
-#define OP_MAX_COUNTER 32
-
-/* Per-perfctr configuration as set via
- * oprofilefs.
- */
-struct op_counter_config {
- unsigned long count;
- unsigned long enabled;
- unsigned long event;
- unsigned long kernel;
- unsigned long user;
- unsigned long unit_mask;
- unsigned long extra;
-};
-
-extern struct op_counter_config counter_config[];
-
-#endif /* OP_COUNTER_H */
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
deleted file mode 100644
index 660a83c8287b..000000000000
--- a/arch/x86/oprofile/op_model_amd.c
+++ /dev/null
@@ -1,542 +0,0 @@
-/*
- * @file op_model_amd.c
- * athlon / K7 / K8 / Family 10h model-specific MSR operations
- *
- * @remark Copyright 2002-2009 OProfile authors
- * @remark Read the file COPYING
- *
- * @author John Levon
- * @author Philippe Elie
- * @author Graydon Hoare
- * @author Robert Richter <robert.richter@amd.com>
- * @author Barry Kasindorf <barry.kasindorf@amd.com>
- * @author Jason Yeh <jason.yeh@amd.com>
- * @author Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
- */
-
-#include <linux/oprofile.h>
-#include <linux/device.h>
-#include <linux/pci.h>
-#include <linux/percpu.h>
-
-#include <asm/ptrace.h>
-#include <asm/msr.h>
-#include <asm/nmi.h>
-#include <asm/apic.h>
-#include <asm/processor.h>
-
-#include "op_x86_model.h"
-#include "op_counter.h"
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-#define NUM_VIRT_COUNTERS 32
-#else
-#define NUM_VIRT_COUNTERS 0
-#endif
-
-#define OP_EVENT_MASK 0x0FFF
-#define OP_CTR_OVERFLOW (1ULL<<31)
-
-#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
-
-static int num_counters;
-static unsigned long reset_value[OP_MAX_COUNTER];
-
-#define IBS_FETCH_SIZE 6
-#define IBS_OP_SIZE 12
-
-static u32 ibs_caps;
-
-struct ibs_config {
- unsigned long op_enabled;
- unsigned long fetch_enabled;
- unsigned long max_cnt_fetch;
- unsigned long max_cnt_op;
- unsigned long rand_en;
- unsigned long dispatched_ops;
- unsigned long branch_target;
-};
-
-struct ibs_state {
- u64 ibs_op_ctl;
- int branch_target;
- unsigned long sample_size;
-};
-
-static struct ibs_config ibs_config;
-static struct ibs_state ibs_state;
-
-/*
- * IBS randomization macros
- */
-#define IBS_RANDOM_BITS 12
-#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1)
-#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5))
-
-/*
- * 16-bit Linear Feedback Shift Register (LFSR)
- *
- * 16 14 13 11
- * Feedback polynomial = X + X + X + X + 1
- */
-static unsigned int lfsr_random(void)
-{
- static unsigned int lfsr_value = 0xF00D;
- unsigned int bit;
-
- /* Compute next bit to shift in */
- bit = ((lfsr_value >> 0) ^
- (lfsr_value >> 2) ^
- (lfsr_value >> 3) ^
- (lfsr_value >> 5)) & 0x0001;
-
- /* Advance to next register value */
- lfsr_value = (lfsr_value >> 1) | (bit << 15);
-
- return lfsr_value;
-}
-
-/*
- * IBS software randomization
- *
- * The IBS periodic op counter is randomized in software. The lower 12
- * bits of the 20 bit counter are randomized. IbsOpCurCnt is
- * initialized with a 12 bit random value.
- */
-static inline u64 op_amd_randomize_ibs_op(u64 val)
-{
- unsigned int random = lfsr_random();
-
- if (!(ibs_caps & IBS_CAPS_RDWROPCNT))
- /*
- * Work around if the hw can not write to IbsOpCurCnt
- *
- * Randomize the lower 8 bits of the 16 bit
- * IbsOpMaxCnt [15:0] value in the range of -128 to
- * +127 by adding/subtracting an offset to the
- * maximum count (IbsOpMaxCnt).
- *
- * To avoid over or underflows and protect upper bits
- * starting at bit 16, the initial value for
- * IbsOpMaxCnt must fit in the range from 0x0081 to
- * 0xff80.
- */
- val += (s8)(random >> 4);
- else
- val |= (u64)(random & IBS_RANDOM_MASK) << 32;
-
- return val;
-}
-
-static inline void
-op_amd_handle_ibs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- u64 val, ctl;
- struct op_entry entry;
-
- if (!ibs_caps)
- return;
-
- if (ibs_config.fetch_enabled) {
- rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
- if (ctl & IBS_FETCH_VAL) {
- rdmsrl(MSR_AMD64_IBSFETCHLINAD, val);
- oprofile_write_reserve(&entry, regs, val,
- IBS_FETCH_CODE, IBS_FETCH_SIZE);
- oprofile_add_data64(&entry, val);
- oprofile_add_data64(&entry, ctl);
- rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val);
- oprofile_add_data64(&entry, val);
- oprofile_write_commit(&entry);
-
- /* reenable the IRQ */
- ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT);
- ctl |= IBS_FETCH_ENABLE;
- wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
- }
- }
-
- if (ibs_config.op_enabled) {
- rdmsrl(MSR_AMD64_IBSOPCTL, ctl);
- if (ctl & IBS_OP_VAL) {
- rdmsrl(MSR_AMD64_IBSOPRIP, val);
- oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE,
- ibs_state.sample_size);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSOPDATA, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSOPDATA2, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSOPDATA3, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSDCLINAD, val);
- oprofile_add_data64(&entry, val);
- rdmsrl(MSR_AMD64_IBSDCPHYSAD, val);
- oprofile_add_data64(&entry, val);
- if (ibs_state.branch_target) {
- rdmsrl(MSR_AMD64_IBSBRTARGET, val);
- oprofile_add_data(&entry, (unsigned long)val);
- }
- oprofile_write_commit(&entry);
-
- /* reenable the IRQ */
- ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
- wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
- }
- }
-}
-
-static inline void op_amd_start_ibs(void)
-{
- u64 val;
-
- if (!ibs_caps)
- return;
-
- memset(&ibs_state, 0, sizeof(ibs_state));
-
- /*
- * Note: Since the max count settings may out of range we
- * write back the actual used values so that userland can read
- * it.
- */
-
- if (ibs_config.fetch_enabled) {
- val = ibs_config.max_cnt_fetch >> 4;
- val = min(val, IBS_FETCH_MAX_CNT);
- ibs_config.max_cnt_fetch = val << 4;
- val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
- val |= IBS_FETCH_ENABLE;
- wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
- }
-
- if (ibs_config.op_enabled) {
- val = ibs_config.max_cnt_op >> 4;
- if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
- /*
- * IbsOpCurCnt not supported. See
- * op_amd_randomize_ibs_op() for details.
- */
- val = clamp(val, 0x0081ULL, 0xFF80ULL);
- ibs_config.max_cnt_op = val << 4;
- } else {
- /*
- * The start value is randomized with a
- * positive offset, we need to compensate it
- * with the half of the randomized range. Also
- * avoid underflows.
- */
- val += IBS_RANDOM_MAXCNT_OFFSET;
- if (ibs_caps & IBS_CAPS_OPCNTEXT)
- val = min(val, IBS_OP_MAX_CNT_EXT);
- else
- val = min(val, IBS_OP_MAX_CNT);
- ibs_config.max_cnt_op =
- (val - IBS_RANDOM_MAXCNT_OFFSET) << 4;
- }
- val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT);
- val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
- val |= IBS_OP_ENABLE;
- ibs_state.ibs_op_ctl = val;
- ibs_state.sample_size = IBS_OP_SIZE;
- if (ibs_config.branch_target) {
- ibs_state.branch_target = 1;
- ibs_state.sample_size++;
- }
- val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
- wrmsrl(MSR_AMD64_IBSOPCTL, val);
- }
-}
-
-static void op_amd_stop_ibs(void)
-{
- if (!ibs_caps)
- return;
-
- if (ibs_config.fetch_enabled)
- /* clear max count and enable */
- wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
-
- if (ibs_config.op_enabled)
- /* clear max count and enable */
- wrmsrl(MSR_AMD64_IBSOPCTL, 0);
-}
-
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /* enable active counters */
- for (i = 0; i < num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[virt]);
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
-#endif
-
-/* functions for op_amd_spec */
-
-static void op_amd_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->counters[i].addr)
- continue;
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
- }
-}
-
-static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; i++) {
- if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
- goto fail;
- if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
- release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
- goto fail;
- }
- /* both registers must be reserved */
- if (num_counters == AMD64_NUM_COUNTERS_CORE) {
- msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
- msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
- } else {
- msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
- msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
- }
- continue;
- fail:
- if (!counter_config[i].enabled)
- continue;
- op_x86_warn_reserved(i);
- op_amd_shutdown(msrs);
- return -EBUSY;
- }
-
- return 0;
-}
-
-static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /* setup reset_value */
- for (i = 0; i < OP_MAX_COUNTER; ++i) {
- if (counter_config[i].enabled
- && msrs->counters[op_x86_virt_to_phys(i)].addr)
- reset_value[i] = counter_config[i].count;
- else
- reset_value[i] = 0;
- }
-
- /* clear all counters */
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->controls[i].addr)
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
- op_x86_warn_in_use(i);
- val &= model->reserved;
- wrmsrl(msrs->controls[i].addr, val);
- /*
- * avoid a false detection of ctr overflows in NMI
- * handler
- */
- wrmsrl(msrs->counters[i].addr, -1LL);
- }
-
- /* enable active counters */
- for (i = 0; i < num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
-
- /* setup counter registers */
- wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
-
- /* setup control registers */
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[virt]);
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
-static int op_amd_check_ctrs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- int virt = op_x86_phys_to_virt(i);
- if (!reset_value[virt])
- continue;
- rdmsrl(msrs->counters[i].addr, val);
- /* bit is clear if overflowed: */
- if (val & OP_CTR_OVERFLOW)
- continue;
- oprofile_add_sample(regs, virt);
- wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
- }
-
- op_amd_handle_ibs(regs, msrs);
-
- /* See op_model_ppro.c */
- return 1;
-}
-
-static void op_amd_start(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[op_x86_phys_to_virt(i)])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
-
- op_amd_start_ibs();
-}
-
-static void op_amd_stop(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- /*
- * Subtle: stop on all counters to avoid race with setting our
- * pm callback
- */
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[op_x86_phys_to_virt(i)])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
-
- op_amd_stop_ibs();
-}
-
-/*
- * check and reserve APIC extended interrupt LVT offset for IBS if
- * available
- */
-
-static void init_ibs(void)
-{
- ibs_caps = get_ibs_caps();
-
- if (!ibs_caps)
- return;
-
- printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
-}
-
-static int (*create_arch_files)(struct dentry *root);
-
-static int setup_ibs_files(struct dentry *root)
-{
- struct dentry *dir;
- int ret = 0;
-
- /* architecture specific files */
- if (create_arch_files)
- ret = create_arch_files(root);
-
- if (ret)
- return ret;
-
- if (!ibs_caps)
- return ret;
-
- /* model specific files */
-
- /* setup some reasonable defaults */
- memset(&ibs_config, 0, sizeof(ibs_config));
- ibs_config.max_cnt_fetch = 250000;
- ibs_config.max_cnt_op = 250000;
-
- if (ibs_caps & IBS_CAPS_FETCHSAM) {
- dir = oprofilefs_mkdir(root, "ibs_fetch");
- oprofilefs_create_ulong(dir, "enable",
- &ibs_config.fetch_enabled);
- oprofilefs_create_ulong(dir, "max_count",
- &ibs_config.max_cnt_fetch);
- oprofilefs_create_ulong(dir, "rand_enable",
- &ibs_config.rand_en);
- }
-
- if (ibs_caps & IBS_CAPS_OPSAM) {
- dir = oprofilefs_mkdir(root, "ibs_op");
- oprofilefs_create_ulong(dir, "enable",
- &ibs_config.op_enabled);
- oprofilefs_create_ulong(dir, "max_count",
- &ibs_config.max_cnt_op);
- if (ibs_caps & IBS_CAPS_OPCNT)
- oprofilefs_create_ulong(dir, "dispatched_ops",
- &ibs_config.dispatched_ops);
- if (ibs_caps & IBS_CAPS_BRNTRGT)
- oprofilefs_create_ulong(dir, "branch_target",
- &ibs_config.branch_target);
- }
-
- return 0;
-}
-
-struct op_x86_model_spec op_amd_spec;
-
-static int op_amd_init(struct oprofile_operations *ops)
-{
- init_ibs();
- create_arch_files = ops->create_files;
- ops->create_files = setup_ibs_files;
-
- if (boot_cpu_data.x86 == 0x15) {
- num_counters = AMD64_NUM_COUNTERS_CORE;
- } else {
- num_counters = AMD64_NUM_COUNTERS;
- }
-
- op_amd_spec.num_counters = num_counters;
- op_amd_spec.num_controls = num_counters;
- op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
-
- return 0;
-}
-
-struct op_x86_model_spec op_amd_spec = {
- /* num_counters/num_controls filled in at runtime */
- .reserved = MSR_AMD_EVENTSEL_RESERVED,
- .event_mask = OP_EVENT_MASK,
- .init = op_amd_init,
- .fill_in_addresses = &op_amd_fill_in_addresses,
- .setup_ctrs = &op_amd_setup_ctrs,
- .check_ctrs = &op_amd_check_ctrs,
- .start = &op_amd_start,
- .stop = &op_amd_stop,
- .shutdown = &op_amd_shutdown,
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
- .switch_ctrl = &op_mux_switch_ctrl,
-#endif
-};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
deleted file mode 100644
index ad1d91f475ab..000000000000
--- a/arch/x86/oprofile/op_model_p4.c
+++ /dev/null
@@ -1,723 +0,0 @@
-/**
- * @file op_model_p4.c
- * P4 model-specific MSR operations
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author Graydon Hoare
- */
-
-#include <linux/oprofile.h>
-#include <linux/smp.h>
-#include <linux/ptrace.h>
-#include <asm/nmi.h>
-#include <asm/msr.h>
-#include <asm/fixmap.h>
-#include <asm/apic.h>
-
-
-#include "op_x86_model.h"
-#include "op_counter.h"
-
-#define NUM_EVENTS 39
-
-#define NUM_COUNTERS_NON_HT 8
-#define NUM_ESCRS_NON_HT 45
-#define NUM_CCCRS_NON_HT 18
-#define NUM_CONTROLS_NON_HT (NUM_ESCRS_NON_HT + NUM_CCCRS_NON_HT)
-
-#define NUM_COUNTERS_HT2 4
-#define NUM_ESCRS_HT2 23
-#define NUM_CCCRS_HT2 9
-#define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2)
-
-#define OP_CTR_OVERFLOW (1ULL<<31)
-
-static unsigned int num_counters = NUM_COUNTERS_NON_HT;
-static unsigned int num_controls = NUM_CONTROLS_NON_HT;
-
-/* this has to be checked dynamically since the
- hyper-threadedness of a chip is discovered at
- kernel boot-time. */
-static inline void setup_num_counters(void)
-{
-#ifdef CONFIG_SMP
- if (smp_num_siblings == 2) {
- num_counters = NUM_COUNTERS_HT2;
- num_controls = NUM_CONTROLS_HT2;
- }
-#endif
-}
-
-static inline int addr_increment(void)
-{
-#ifdef CONFIG_SMP
- return smp_num_siblings == 2 ? 2 : 1;
-#else
- return 1;
-#endif
-}
-
-
-/* tables to simulate simplified hardware view of p4 registers */
-struct p4_counter_binding {
- int virt_counter;
- int counter_address;
- int cccr_address;
-};
-
-struct p4_event_binding {
- int escr_select; /* value to put in CCCR */
- int event_select; /* value to put in ESCR */
- struct {
- int virt_counter; /* for this counter... */
- int escr_address; /* use this ESCR */
- } bindings[2];
-};
-
-/* nb: these CTR_* defines are a duplicate of defines in
- event/i386.p4*events. */
-
-
-#define CTR_BPU_0 (1 << 0)
-#define CTR_MS_0 (1 << 1)
-#define CTR_FLAME_0 (1 << 2)
-#define CTR_IQ_4 (1 << 3)
-#define CTR_BPU_2 (1 << 4)
-#define CTR_MS_2 (1 << 5)
-#define CTR_FLAME_2 (1 << 6)
-#define CTR_IQ_5 (1 << 7)
-
-static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
- { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
- { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
- { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
- { CTR_IQ_4, MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_CCCR4 },
- { CTR_BPU_2, MSR_P4_BPU_PERFCTR2, MSR_P4_BPU_CCCR2 },
- { CTR_MS_2, MSR_P4_MS_PERFCTR2, MSR_P4_MS_CCCR2 },
- { CTR_FLAME_2, MSR_P4_FLAME_PERFCTR2, MSR_P4_FLAME_CCCR2 },
- { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
-};
-
-#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
-
-/* p4 event codes in libop/op_event.h are indices into this table. */
-
-static struct p4_event_binding p4_events[NUM_EVENTS] = {
-
- { /* BRANCH_RETIRED */
- 0x05, 0x06,
- { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
- {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* MISPRED_BRANCH_RETIRED */
- 0x04, 0x03,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
- { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
- },
-
- { /* TC_DELIVER_MODE */
- 0x01, 0x01,
- { { CTR_MS_0, MSR_P4_TC_ESCR0},
- { CTR_MS_2, MSR_P4_TC_ESCR1} }
- },
-
- { /* BPU_FETCH_REQUEST */
- 0x00, 0x03,
- { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
- { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
- },
-
- { /* ITLB_REFERENCE */
- 0x03, 0x18,
- { { CTR_BPU_0, MSR_P4_ITLB_ESCR0},
- { CTR_BPU_2, MSR_P4_ITLB_ESCR1} }
- },
-
- { /* MEMORY_CANCEL */
- 0x05, 0x02,
- { { CTR_FLAME_0, MSR_P4_DAC_ESCR0},
- { CTR_FLAME_2, MSR_P4_DAC_ESCR1} }
- },
-
- { /* MEMORY_COMPLETE */
- 0x02, 0x08,
- { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
- { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
- },
-
- { /* LOAD_PORT_REPLAY */
- 0x02, 0x04,
- { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
- { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
- },
-
- { /* STORE_PORT_REPLAY */
- 0x02, 0x05,
- { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
- { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
- },
-
- { /* MOB_LOAD_REPLAY */
- 0x02, 0x03,
- { { CTR_BPU_0, MSR_P4_MOB_ESCR0},
- { CTR_BPU_2, MSR_P4_MOB_ESCR1} }
- },
-
- { /* PAGE_WALK_TYPE */
- 0x04, 0x01,
- { { CTR_BPU_0, MSR_P4_PMH_ESCR0},
- { CTR_BPU_2, MSR_P4_PMH_ESCR1} }
- },
-
- { /* BSQ_CACHE_REFERENCE */
- 0x07, 0x0c,
- { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
- { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
- },
-
- { /* IOQ_ALLOCATION */
- 0x06, 0x03,
- { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
- { 0, 0 } }
- },
-
- { /* IOQ_ACTIVE_ENTRIES */
- 0x06, 0x1a,
- { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
- { 0, 0 } }
- },
-
- { /* FSB_DATA_ACTIVITY */
- 0x06, 0x17,
- { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
- { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
- },
-
- { /* BSQ_ALLOCATION */
- 0x07, 0x05,
- { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
- { 0, 0 } }
- },
-
- { /* BSQ_ACTIVE_ENTRIES */
- 0x07, 0x06,
- { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
- { 0, 0 } }
- },
-
- { /* X87_ASSIST */
- 0x05, 0x03,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* SSE_INPUT_ASSIST */
- 0x01, 0x34,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* PACKED_SP_UOP */
- 0x01, 0x08,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* PACKED_DP_UOP */
- 0x01, 0x0c,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* SCALAR_SP_UOP */
- 0x01, 0x0a,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* SCALAR_DP_UOP */
- 0x01, 0x0e,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* 64BIT_MMX_UOP */
- 0x01, 0x02,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* 128BIT_MMX_UOP */
- 0x01, 0x1a,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* X87_FP_UOP */
- 0x01, 0x04,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* X87_SIMD_MOVES_UOP */
- 0x01, 0x2e,
- { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
- { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
- },
-
- { /* MACHINE_CLEAR */
- 0x05, 0x02,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* GLOBAL_POWER_EVENTS */
- 0x06, 0x13 /* older manual says 0x05, newer 0x13 */,
- { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
- { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
- },
-
- { /* TC_MS_XFER */
- 0x00, 0x05,
- { { CTR_MS_0, MSR_P4_MS_ESCR0},
- { CTR_MS_2, MSR_P4_MS_ESCR1} }
- },
-
- { /* UOP_QUEUE_WRITES */
- 0x00, 0x09,
- { { CTR_MS_0, MSR_P4_MS_ESCR0},
- { CTR_MS_2, MSR_P4_MS_ESCR1} }
- },
-
- { /* FRONT_END_EVENT */
- 0x05, 0x08,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* EXECUTION_EVENT */
- 0x05, 0x0c,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* REPLAY_EVENT */
- 0x05, 0x09,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
- { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
- },
-
- { /* INSTR_RETIRED */
- 0x04, 0x02,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
- { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
- },
-
- { /* UOPS_RETIRED */
- 0x04, 0x01,
- { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
- { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
- },
-
- { /* UOP_TYPE */
- 0x02, 0x02,
- { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
- { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
- },
-
- { /* RETIRED_MISPRED_BRANCH_TYPE */
- 0x02, 0x05,
- { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
- { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
- },
-
- { /* RETIRED_BRANCH_TYPE */
- 0x02, 0x04,
- { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
- { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
- }
-};
-
-
-#define MISC_PMC_ENABLED_P(x) ((x) & 1 << 7)
-
-#define ESCR_RESERVED_BITS 0x80000003
-#define ESCR_CLEAR(escr) ((escr) &= ESCR_RESERVED_BITS)
-#define ESCR_SET_USR_0(escr, usr) ((escr) |= (((usr) & 1) << 2))
-#define ESCR_SET_OS_0(escr, os) ((escr) |= (((os) & 1) << 3))
-#define ESCR_SET_USR_1(escr, usr) ((escr) |= (((usr) & 1)))
-#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
-#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
-#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
-
-#define CCCR_RESERVED_BITS 0x38030FFF
-#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
-#define CCCR_SET_REQUIRED_BITS(cccr) ((cccr) |= 0x00030000)
-#define CCCR_SET_ESCR_SELECT(cccr, sel) ((cccr) |= (((sel) & 0x07) << 13))
-#define CCCR_SET_PMI_OVF_0(cccr) ((cccr) |= (1<<26))
-#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
-#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
-#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
-#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
-#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
-
-
-/* this assigns a "stagger" to the current CPU, which is used throughout
- the code in this module as an extra array offset, to select the "even"
- or "odd" part of all the divided resources. */
-static unsigned int get_stagger(void)
-{
-#ifdef CONFIG_SMP
- int cpu = smp_processor_id();
- return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map));
-#endif
- return 0;
-}
-
-
-/* finally, mediate access to a real hardware counter
- by passing a "virtual" counter numer to this macro,
- along with your stagger setting. */
-#define VIRT_CTR(stagger, i) ((i) + ((num_counters) * (stagger)))
-
-static unsigned long reset_value[NUM_COUNTERS_NON_HT];
-
-static void p4_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (msrs->counters[i].addr)
- release_perfctr_nmi(msrs->counters[i].addr);
- }
- /*
- * some of the control registers are specially reserved in
- * conjunction with the counter registers (hence the starting offset).
- * This saves a few bits.
- */
- for (i = num_counters; i < num_controls; ++i) {
- if (msrs->controls[i].addr)
- release_evntsel_nmi(msrs->controls[i].addr);
- }
-}
-
-static int p4_fill_in_addresses(struct op_msrs * const msrs)
-{
- unsigned int i;
- unsigned int addr, cccraddr, stag;
-
- setup_num_counters();
- stag = get_stagger();
-
- /* the counter & cccr registers we pay attention to */
- for (i = 0; i < num_counters; ++i) {
- addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
- cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
- if (reserve_perfctr_nmi(addr)) {
- msrs->counters[i].addr = addr;
- msrs->controls[i].addr = cccraddr;
- }
- }
-
- /* 43 ESCR registers in three or four discontiguous group */
- for (addr = MSR_P4_BSU_ESCR0 + stag;
- addr < MSR_P4_IQ_ESCR0; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- /* no IQ_ESCR0/1 on some models, we save a seconde time BSU_ESCR0/1
- * to avoid special case in nmi_{save|restore}_registers() */
- if (boot_cpu_data.x86_model >= 0x3) {
- for (addr = MSR_P4_BSU_ESCR0 + stag;
- addr <= MSR_P4_BSU_ESCR1; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
- } else {
- for (addr = MSR_P4_IQ_ESCR0 + stag;
- addr <= MSR_P4_IQ_ESCR1; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
- }
-
- for (addr = MSR_P4_RAT_ESCR0 + stag;
- addr <= MSR_P4_SSU_ESCR0; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- for (addr = MSR_P4_MS_ESCR0 + stag;
- addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- for (addr = MSR_P4_IX_ESCR0 + stag;
- addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
- if (reserve_evntsel_nmi(addr))
- msrs->controls[i].addr = addr;
- }
-
- /* there are 2 remaining non-contiguously located ESCRs */
-
- if (num_counters == NUM_COUNTERS_NON_HT) {
- /* standard non-HT CPUs handle both remaining ESCRs*/
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
-
- } else if (stag == 0) {
- /* HT CPUs give the first remainder to the even thread, as
- the 32nd control register */
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR4))
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR4;
-
- } else {
- /* and two copies of the second to the odd thread,
- for the 22st and 23nd control registers */
- if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) {
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
- msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
- }
- }
-
- for (i = 0; i < num_counters; ++i) {
- if (!counter_config[i].enabled)
- continue;
- if (msrs->controls[i].addr)
- continue;
- op_x86_warn_reserved(i);
- p4_shutdown(msrs);
- return -EBUSY;
- }
-
- return 0;
-}
-
-
-static void pmc_setup_one_p4_counter(unsigned int ctr)
-{
- int i;
- int const maxbind = 2;
- unsigned int cccr = 0;
- unsigned int escr = 0;
- unsigned int high = 0;
- unsigned int counter_bit;
- struct p4_event_binding *ev = NULL;
- unsigned int stag;
-
- stag = get_stagger();
-
- /* convert from counter *number* to counter *bit* */
- counter_bit = 1 << VIRT_CTR(stag, ctr);
-
- /* find our event binding structure. */
- if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
- printk(KERN_ERR
- "oprofile: P4 event code 0x%lx out of range\n",
- counter_config[ctr].event);
- return;
- }
-
- ev = &(p4_events[counter_config[ctr].event - 1]);
-
- for (i = 0; i < maxbind; i++) {
- if (ev->bindings[i].virt_counter & counter_bit) {
-
- /* modify ESCR */
- rdmsr(ev->bindings[i].escr_address, escr, high);
- ESCR_CLEAR(escr);
- if (stag == 0) {
- ESCR_SET_USR_0(escr, counter_config[ctr].user);
- ESCR_SET_OS_0(escr, counter_config[ctr].kernel);
- } else {
- ESCR_SET_USR_1(escr, counter_config[ctr].user);
- ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
- }
- ESCR_SET_EVENT_SELECT(escr, ev->event_select);
- ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
- wrmsr(ev->bindings[i].escr_address, escr, high);
-
- /* modify CCCR */
- rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
- cccr, high);
- CCCR_CLEAR(cccr);
- CCCR_SET_REQUIRED_BITS(cccr);
- CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
- if (stag == 0)
- CCCR_SET_PMI_OVF_0(cccr);
- else
- CCCR_SET_PMI_OVF_1(cccr);
- wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address,
- cccr, high);
- return;
- }
- }
-
- printk(KERN_ERR
- "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
- counter_config[ctr].event, stag, ctr);
-}
-
-
-static void p4_setup_ctrs(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- unsigned int i;
- unsigned int low, high;
- unsigned int stag;
-
- stag = get_stagger();
-
- rdmsr(MSR_IA32_MISC_ENABLE, low, high);
- if (!MISC_PMC_ENABLED_P(low)) {
- printk(KERN_ERR "oprofile: P4 PMC not available\n");
- return;
- }
-
- /* clear the cccrs we will use */
- for (i = 0; i < num_counters; i++) {
- if (unlikely(!msrs->controls[i].addr))
- continue;
- rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- CCCR_CLEAR(low);
- CCCR_SET_REQUIRED_BITS(low);
- wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- }
-
- /* clear all escrs (including those outside our concern) */
- for (i = num_counters; i < num_controls; i++) {
- if (unlikely(!msrs->controls[i].addr))
- continue;
- wrmsr(msrs->controls[i].addr, 0, 0);
- }
-
- /* setup all counters */
- for (i = 0; i < num_counters; ++i) {
- if (counter_config[i].enabled && msrs->controls[i].addr) {
- reset_value[i] = counter_config[i].count;
- pmc_setup_one_p4_counter(i);
- wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address,
- -(u64)counter_config[i].count);
- } else {
- reset_value[i] = 0;
- }
- }
-}
-
-
-static int p4_check_ctrs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- unsigned long ctr, low, high, stag, real;
- int i;
-
- stag = get_stagger();
-
- for (i = 0; i < num_counters; ++i) {
-
- if (!reset_value[i])
- continue;
-
- /*
- * there is some eccentricity in the hardware which
- * requires that we perform 2 extra corrections:
- *
- * - check both the CCCR:OVF flag for overflow and the
- * counter high bit for un-flagged overflows.
- *
- * - write the counter back twice to ensure it gets
- * updated properly.
- *
- * the former seems to be related to extra NMIs happening
- * during the current NMI; the latter is reported as errata
- * N15 in intel doc 249199-029, pentium 4 specification
- * update, though their suggested work-around does not
- * appear to solve the problem.
- */
-
- real = VIRT_CTR(stag, i);
-
- rdmsr(p4_counters[real].cccr_address, low, high);
- rdmsr(p4_counters[real].counter_address, ctr, high);
- if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) {
- oprofile_add_sample(regs, i);
- wrmsrl(p4_counters[real].counter_address,
- -(u64)reset_value[i]);
- CCCR_CLEAR_OVF(low);
- wrmsr(p4_counters[real].cccr_address, low, high);
- wrmsrl(p4_counters[real].counter_address,
- -(u64)reset_value[i]);
- }
- }
-
- /* P4 quirk: you have to re-unmask the apic vector */
- apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-
- /* See op_model_ppro.c */
- return 1;
-}
-
-
-static void p4_start(struct op_msrs const * const msrs)
-{
- unsigned int low, high, stag;
- int i;
-
- stag = get_stagger();
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- CCCR_SET_ENABLE(low);
- wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- }
-}
-
-
-static void p4_stop(struct op_msrs const * const msrs)
-{
- unsigned int low, high, stag;
- int i;
-
- stag = get_stagger();
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- CCCR_SET_DISABLE(low);
- wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
- }
-}
-
-#ifdef CONFIG_SMP
-struct op_x86_model_spec op_p4_ht2_spec = {
- .num_counters = NUM_COUNTERS_HT2,
- .num_controls = NUM_CONTROLS_HT2,
- .fill_in_addresses = &p4_fill_in_addresses,
- .setup_ctrs = &p4_setup_ctrs,
- .check_ctrs = &p4_check_ctrs,
- .start = &p4_start,
- .stop = &p4_stop,
- .shutdown = &p4_shutdown
-};
-#endif
-
-struct op_x86_model_spec op_p4_spec = {
- .num_counters = NUM_COUNTERS_NON_HT,
- .num_controls = NUM_CONTROLS_NON_HT,
- .fill_in_addresses = &p4_fill_in_addresses,
- .setup_ctrs = &p4_setup_ctrs,
- .check_ctrs = &p4_check_ctrs,
- .start = &p4_start,
- .stop = &p4_stop,
- .shutdown = &p4_shutdown
-};
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
deleted file mode 100644
index 7913b6921959..000000000000
--- a/arch/x86/oprofile/op_model_ppro.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * @file op_model_ppro.h
- * Family 6 perfmon and architectural perfmon MSR operations
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Copyright 2008 Intel Corporation
- * @remark Read the file COPYING
- *
- * @author John Levon
- * @author Philippe Elie
- * @author Graydon Hoare
- * @author Andi Kleen
- * @author Robert Richter <robert.richter@amd.com>
- */
-
-#include <linux/oprofile.h>
-#include <linux/slab.h>
-#include <asm/ptrace.h>
-#include <asm/msr.h>
-#include <asm/apic.h>
-#include <asm/nmi.h>
-
-#include "op_x86_model.h"
-#include "op_counter.h"
-
-static int num_counters = 2;
-static int counter_width = 32;
-
-#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21))
-
-static u64 reset_value[OP_MAX_COUNTER];
-
-static void ppro_shutdown(struct op_msrs const * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->counters[i].addr)
- continue;
- release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
- release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
- }
-}
-
-static int ppro_fill_in_addresses(struct op_msrs * const msrs)
-{
- int i;
-
- for (i = 0; i < num_counters; i++) {
- if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
- goto fail;
- if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
- release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
- goto fail;
- }
- /* both registers must be reserved */
- msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
- msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
- continue;
- fail:
- if (!counter_config[i].enabled)
- continue;
- op_x86_warn_reserved(i);
- ppro_shutdown(msrs);
- return -EBUSY;
- }
-
- return 0;
-}
-
-
-static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
- union cpuid10_eax eax;
- eax.full = cpuid_eax(0xa);
-
- /*
- * For Core2 (family 6, model 15), don't reset the
- * counter width:
- */
- if (!(eax.split.version_id == 0 &&
- __this_cpu_read(cpu_info.x86) == 6 &&
- __this_cpu_read(cpu_info.x86_model) == 15)) {
-
- if (counter_width < eax.split.bit_width)
- counter_width = eax.split.bit_width;
- }
- }
-
- /* clear all counters */
- for (i = 0; i < num_counters; ++i) {
- if (!msrs->controls[i].addr)
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
- op_x86_warn_in_use(i);
- val &= model->reserved;
- wrmsrl(msrs->controls[i].addr, val);
- /*
- * avoid a false detection of ctr overflows in NMI *
- * handler
- */
- wrmsrl(msrs->counters[i].addr, -1LL);
- }
-
- /* enable active counters */
- for (i = 0; i < num_counters; ++i) {
- if (counter_config[i].enabled && msrs->counters[i].addr) {
- reset_value[i] = counter_config[i].count;
- wrmsrl(msrs->counters[i].addr, -reset_value[i]);
- rdmsrl(msrs->controls[i].addr, val);
- val &= model->reserved;
- val |= op_x86_get_ctrl(model, &counter_config[i]);
- wrmsrl(msrs->controls[i].addr, val);
- } else {
- reset_value[i] = 0;
- }
- }
-}
-
-
-static int ppro_check_ctrs(struct pt_regs * const regs,
- struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsrl(msrs->counters[i].addr, val);
- if (val & (1ULL << (counter_width - 1)))
- continue;
- oprofile_add_sample(regs, i);
- wrmsrl(msrs->counters[i].addr, -reset_value[i]);
- }
-
- /* Only P6 based Pentium M need to re-unmask the apic vector but it
- * doesn't hurt other P6 variant */
- apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-
- /* We can't work out if we really handled an interrupt. We
- * might have caught a *second* counter just after overflowing
- * the interrupt for this counter then arrives
- * and we don't find a counter that's overflowed, so we
- * would return 0 and get dazed + confused. Instead we always
- * assume we found an overflow. This sucks.
- */
- return 1;
-}
-
-
-static void ppro_start(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (reset_value[i]) {
- rdmsrl(msrs->controls[i].addr, val);
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
- }
-}
-
-
-static void ppro_stop(struct op_msrs const * const msrs)
-{
- u64 val;
- int i;
-
- for (i = 0; i < num_counters; ++i) {
- if (!reset_value[i])
- continue;
- rdmsrl(msrs->controls[i].addr, val);
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(msrs->controls[i].addr, val);
- }
-}
-
-struct op_x86_model_spec op_ppro_spec = {
- .num_counters = 2,
- .num_controls = 2,
- .reserved = MSR_PPRO_EVENTSEL_RESERVED,
- .fill_in_addresses = &ppro_fill_in_addresses,
- .setup_ctrs = &ppro_setup_ctrs,
- .check_ctrs = &ppro_check_ctrs,
- .start = &ppro_start,
- .stop = &ppro_stop,
- .shutdown = &ppro_shutdown
-};
-
-/*
- * Architectural performance monitoring.
- *
- * Newer Intel CPUs (Core1+) have support for architectural
- * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details.
- * The advantage of this is that it can be done without knowing about
- * the specific CPU.
- */
-
-static void arch_perfmon_setup_counters(void)
-{
- union cpuid10_eax eax;
-
- eax.full = cpuid_eax(0xa);
-
- /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
- if (eax.split.version_id == 0 && boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 15) {
- eax.split.version_id = 2;
- eax.split.num_counters = 2;
- eax.split.bit_width = 40;
- }
-
- num_counters = min((int)eax.split.num_counters, OP_MAX_COUNTER);
-
- op_arch_perfmon_spec.num_counters = num_counters;
- op_arch_perfmon_spec.num_controls = num_counters;
-}
-
-static int arch_perfmon_init(struct oprofile_operations *ignore)
-{
- arch_perfmon_setup_counters();
- return 0;
-}
-
-struct op_x86_model_spec op_arch_perfmon_spec = {
- .reserved = MSR_PPRO_EVENTSEL_RESERVED,
- .init = &arch_perfmon_init,
- /* num_counters/num_controls filled in at runtime */
- .fill_in_addresses = &ppro_fill_in_addresses,
- /* user space does the cpuid check for available events */
- .setup_ctrs = &ppro_setup_ctrs,
- .check_ctrs = &ppro_check_ctrs,
- .start = &ppro_start,
- .stop = &ppro_stop,
- .shutdown = &ppro_shutdown
-};
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
deleted file mode 100644
index 276cf79b5d24..000000000000
--- a/arch/x86/oprofile/op_x86_model.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- * @file op_x86_model.h
- * interface to x86 model-specific MSR operations
- *
- * @remark Copyright 2002 OProfile authors
- * @remark Read the file COPYING
- *
- * @author Graydon Hoare
- * @author Robert Richter <robert.richter@amd.com>
- */
-
-#ifndef OP_X86_MODEL_H
-#define OP_X86_MODEL_H
-
-#include <asm/types.h>
-#include <asm/perf_event.h>
-
-struct op_msr {
- unsigned long addr;
- u64 saved;
-};
-
-struct op_msrs {
- struct op_msr *counters;
- struct op_msr *controls;
- struct op_msr *multiplex;
-};
-
-struct pt_regs;
-
-struct oprofile_operations;
-
-/* The model vtable abstracts the differences between
- * various x86 CPU models' perfctr support.
- */
-struct op_x86_model_spec {
- unsigned int num_counters;
- unsigned int num_controls;
- unsigned int num_virt_counters;
- u64 reserved;
- u16 event_mask;
- int (*init)(struct oprofile_operations *ops);
- int (*fill_in_addresses)(struct op_msrs * const msrs);
- void (*setup_ctrs)(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs);
- int (*check_ctrs)(struct pt_regs * const regs,
- struct op_msrs const * const msrs);
- void (*start)(struct op_msrs const * const msrs);
- void (*stop)(struct op_msrs const * const msrs);
- void (*shutdown)(struct op_msrs const * const msrs);
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
- void (*switch_ctrl)(struct op_x86_model_spec const *model,
- struct op_msrs const * const msrs);
-#endif
-};
-
-struct op_counter_config;
-
-static inline void op_x86_warn_in_use(int counter)
-{
- /*
- * The warning indicates an already running counter. If
- * oprofile doesn't collect data, then try using a different
- * performance counter on your platform to monitor the desired
- * event. Delete counter #%d from the desired event by editing
- * the /usr/share/oprofile/%s/<cpu>/events file. If the event
- * cannot be monitored by any other counter, contact your
- * hardware or BIOS vendor.
- */
- pr_warn("oprofile: counter #%d on cpu #%d may already be used\n",
- counter, smp_processor_id());
-}
-
-static inline void op_x86_warn_reserved(int counter)
-{
- pr_warn("oprofile: counter #%d is already reserved\n", counter);
-}
-
-extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
- struct op_counter_config *counter_config);
-extern int op_x86_phys_to_virt(int phys);
-extern int op_x86_virt_to_phys(int virt);
-
-extern struct op_x86_model_spec op_ppro_spec;
-extern struct op_x86_model_spec op_p4_spec;
-extern struct op_x86_model_spec op_p4_ht2_spec;
-extern struct op_x86_model_spec op_amd_spec;
-extern struct op_x86_model_spec op_arch_perfmon_spec;
-
-#endif /* OP_X86_MODEL_H */
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 95e2e6bd8d8c..8edd62206604 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -28,10 +28,12 @@
#include <linux/io.h>
#include <linux/smp.h>
+#include <asm/cpu_device_id.h>
#include <asm/segment.h>
#include <asm/pci_x86.h>
#include <asm/hw_irq.h>
#include <asm/io_apic.h>
+#include <asm/intel-family.h>
#include <asm/intel-mid.h>
#include <asm/acpi.h>
@@ -140,6 +142,7 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
* type1_access_ok - check whether to use type 1
* @bus: bus number
* @devfn: device & function in question
+ * @reg: configuration register offset
*
* If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at
* all, the we can go ahead with any reads & writes. If it's on a Lincroft,
@@ -212,10 +215,17 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
where, size, value);
}
+static const struct x86_cpu_id intel_mid_cpu_ids[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, NULL),
+ {}
+};
+
static int intel_mid_pci_irq_enable(struct pci_dev *dev)
{
+ const struct x86_cpu_id *id;
struct irq_alloc_info info;
bool polarity_low;
+ u16 model = 0;
int ret;
u8 gsi;
@@ -228,8 +238,12 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
return ret;
}
- switch (intel_mid_identify_cpu()) {
- case INTEL_MID_CPU_CHIP_TANGIER:
+ id = x86_match_cpu(intel_mid_cpu_ids);
+ if (id)
+ model = id->model;
+
+ switch (model) {
+ case INTEL_FAM6_ATOM_SILVERMONT_MID:
polarity_low = false;
/* Special treatment for IRQ0 */
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 234998f196d4..de6bf0e7e8f8 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -11,9 +11,9 @@
* themselves.
*/
+#include <linux/acpi.h>
#include <linux/pci.h>
#include <linux/init.h>
-#include <linux/sfi_acpi.h>
#include <linux/bitmap.h>
#include <linux/dmi.h>
#include <linux/slab.h>
@@ -665,7 +665,7 @@ void __init pci_mmcfg_early_init(void)
if (pci_mmcfg_check_hostbridge())
known_bridge = 1;
else
- acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+ acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
__pci_mmcfg_init(1);
set_apei_filter();
@@ -683,7 +683,7 @@ void __init pci_mmcfg_late_init(void)
/* MMCONFIG hasn't been enabled yet, try again */
if (pci_probe & PCI_PROBE_MASK & ~PCI_PROBE_MMCONF) {
- acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
+ acpi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
__pci_mmcfg_init(0);
}
}
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index d0e835470d01..3ed03a2552d0 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -4,13 +4,11 @@ obj-y += atom/
obj-y += ce4100/
obj-y += efi/
obj-y += geode/
-obj-y += goldfish/
obj-y += iris/
obj-y += intel/
obj-y += intel-mid/
obj-y += intel-quark/
obj-y += olpc/
obj-y += scx200/
-obj-y += sfi/
obj-y += ts5500/
obj-y += uv/
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8efd003540ca..1b82d77019b1 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -54,10 +54,7 @@
* 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
*/
static u64 efi_va = EFI_VA_START;
-
-struct efi_scratch efi_scratch;
-
-EXPORT_SYMBOL_GPL(efi_mm);
+static struct mm_struct *efi_prev_mm;
/*
* We need our own copy of the higher levels of the page tables
@@ -237,7 +234,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
return 1;
}
- efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */
+ efi_mixed_mode_stack_pa = page_to_phys(page + 1); /* stack grows down */
npages = (_etext - _text) >> PAGE_SHIFT;
text = __pa(_text);
@@ -462,11 +459,17 @@ void __init efi_dump_pagetable(void)
* can not change under us.
* It should be ensured that there are no concurent calls to this function.
*/
-void efi_switch_mm(struct mm_struct *mm)
+void efi_enter_mm(void)
+{
+ efi_prev_mm = current->active_mm;
+ current->active_mm = &efi_mm;
+ switch_mm(efi_prev_mm, &efi_mm, NULL);
+}
+
+void efi_leave_mm(void)
{
- efi_scratch.prev_mm = current->active_mm;
- current->active_mm = mm;
- switch_mm(efi_scratch.prev_mm, mm, NULL);
+ current->active_mm = efi_prev_mm;
+ switch_mm(&efi_mm, efi_prev_mm, NULL);
}
static DEFINE_SPINLOCK(efi_runtime_lock);
@@ -530,12 +533,12 @@ efi_thunk_set_virtual_address_map(unsigned long memory_map_size,
efi_sync_low_kernel_mappings();
local_irq_save(flags);
- efi_switch_mm(&efi_mm);
+ efi_enter_mm();
status = __efi_thunk(set_virtual_address_map, memory_map_size,
descriptor_size, descriptor_version, virtual_map);
- efi_switch_mm(efi_scratch.prev_mm);
+ efi_leave_mm();
local_irq_restore(flags);
return status;
@@ -829,9 +832,9 @@ efi_set_virtual_address_map(unsigned long memory_map_size,
descriptor_size,
descriptor_version,
virtual_map);
- efi_switch_mm(&efi_mm);
+ efi_enter_mm();
- kernel_fpu_begin();
+ efi_fpu_begin();
/* Disable interrupts around EFI calls: */
local_irq_save(flags);
@@ -840,12 +843,12 @@ efi_set_virtual_address_map(unsigned long memory_map_size,
descriptor_version, virtual_map);
local_irq_restore(flags);
- kernel_fpu_end();
+ efi_fpu_end();
/* grab the virtually remapped EFI runtime services table pointer */
efi.runtime = READ_ONCE(systab->runtime);
- efi_switch_mm(efi_scratch.prev_mm);
+ efi_leave_mm();
return status;
}
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index 26f0da238c1c..fd3dd1708eba 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -33,7 +33,7 @@ SYM_CODE_START(__efi64_thunk)
* Switch to 1:1 mapped 32-bit stack pointer.
*/
movq %rsp, %rax
- movq efi_scratch(%rip), %rsp
+ movq efi_mixed_mode_stack_pa(%rip), %rsp
push %rax
/*
@@ -70,3 +70,7 @@ SYM_CODE_START(__efi64_thunk)
pushl %ebp
lret
SYM_CODE_END(__efi64_thunk)
+
+ .bss
+ .balign 8
+SYM_DATA(efi_mixed_mode_stack_pa, .quad 0)
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 5a40fe411ebd..67d93a243c35 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -687,15 +687,25 @@ int efi_capsule_setup_info(struct capsule_info *cap_info, void *kbuff,
* @return: Returns, if the page fault is not handled. This function
* will never return if the page fault is handled successfully.
*/
-void efi_recover_from_page_fault(unsigned long phys_addr)
+void efi_crash_gracefully_on_page_fault(unsigned long phys_addr)
{
if (!IS_ENABLED(CONFIG_X86_64))
return;
/*
+ * If we get an interrupt/NMI while processing an EFI runtime service
+ * then this is a regular OOPS, not an EFI failure.
+ */
+ if (in_interrupt())
+ return;
+
+ /*
* Make sure that an efi runtime service caused the page fault.
+ * READ_ONCE() because we might be OOPSing in a different thread,
+ * and we don't want to trip KTSAN while trying to OOPS.
*/
- if (efi_rts_work.efi_rts_id == EFI_NONE)
+ if (READ_ONCE(efi_rts_work.efi_rts_id) == EFI_NONE ||
+ current_work() != &efi_rts_work.work)
return;
/*
@@ -747,6 +757,4 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
set_current_state(TASK_IDLE);
schedule();
}
-
- return;
}
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index c33f744b5388..b39bf3b5e108 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -22,6 +22,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <linux/dmi.h>
#include <asm/geode.h>
@@ -69,21 +70,15 @@ static struct platform_device alix_buttons_dev = {
static struct gpio_led alix_leds[] = {
{
.name = "alix:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 1,
},
{
.name = "alix:2",
- .gpio = 25,
.default_trigger = "default-off",
- .active_low = 1,
},
{
.name = "alix:3",
- .gpio = 27,
.default_trigger = "default-off",
- .active_low = 1,
},
};
@@ -92,6 +87,17 @@ static struct gpio_led_platform_data alix_leds_data = {
.leds = alix_leds,
};
+static struct gpiod_lookup_table alix_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW),
+ { }
+ },
+};
+
static struct platform_device alix_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -106,6 +112,7 @@ static struct platform_device *alix_devs[] __initdata = {
static void __init register_alix(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&alix_leds_gpio_table);
platform_add_devices(alix_devs, ARRAY_SIZE(alix_devs));
}
diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c
index 73a3f49b4eb6..d263528c90bb 100644
--- a/arch/x86/platform/geode/geos.c
+++ b/arch/x86/platform/geode/geos.c
@@ -20,6 +20,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <linux/dmi.h>
#include <asm/geode.h>
@@ -53,21 +54,15 @@ static struct platform_device geos_buttons_dev = {
static struct gpio_led geos_leds[] = {
{
.name = "geos:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 1,
},
{
.name = "geos:2",
- .gpio = 25,
.default_trigger = "default-off",
- .active_low = 1,
},
{
.name = "geos:3",
- .gpio = 27,
.default_trigger = "default-off",
- .active_low = 1,
},
};
@@ -76,6 +71,17 @@ static struct gpio_led_platform_data geos_leds_data = {
.leds = geos_leds,
};
+static struct gpiod_lookup_table geos_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 25, NULL, 1, GPIO_ACTIVE_LOW),
+ GPIO_LOOKUP_IDX("cs5535-gpio", 27, NULL, 2, GPIO_ACTIVE_LOW),
+ { }
+ },
+};
+
static struct platform_device geos_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -90,6 +96,7 @@ static struct platform_device *geos_devs[] __initdata = {
static void __init register_geos(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&geos_leds_gpio_table);
platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs));
}
diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c
index 163e1b545517..558384acd777 100644
--- a/arch/x86/platform/geode/net5501.c
+++ b/arch/x86/platform/geode/net5501.c
@@ -20,6 +20,7 @@
#include <linux/platform_device.h>
#include <linux/input.h>
#include <linux/gpio_keys.h>
+#include <linux/gpio/machine.h>
#include <asm/geode.h>
@@ -55,9 +56,7 @@ static struct platform_device net5501_buttons_dev = {
static struct gpio_led net5501_leds[] = {
{
.name = "net5501:1",
- .gpio = 6,
.default_trigger = "default-on",
- .active_low = 0,
},
};
@@ -66,6 +65,15 @@ static struct gpio_led_platform_data net5501_leds_data = {
.leds = net5501_leds,
};
+static struct gpiod_lookup_table net5501_leds_gpio_table = {
+ .dev_id = "leds-gpio",
+ .table = {
+ /* The Geode GPIOs should be on the CS5535 companion chip */
+ GPIO_LOOKUP_IDX("cs5535-gpio", 6, NULL, 0, GPIO_ACTIVE_HIGH),
+ { }
+ },
+};
+
static struct platform_device net5501_leds_dev = {
.name = "leds-gpio",
.id = -1,
@@ -80,6 +88,7 @@ static struct platform_device *net5501_devs[] __initdata = {
static void __init register_net5501(void)
{
/* Setup LED control through leds-gpio driver */
+ gpiod_add_lookup_table(&net5501_leds_gpio_table);
platform_add_devices(net5501_devs, ARRAY_SIZE(net5501_devs));
}
diff --git a/arch/x86/platform/goldfish/Makefile b/arch/x86/platform/goldfish/Makefile
deleted file mode 100644
index 072c395379ac..000000000000
--- a/arch/x86/platform/goldfish/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_GOLDFISH) += goldfish.o
diff --git a/arch/x86/platform/goldfish/goldfish.c b/arch/x86/platform/goldfish/goldfish.c
deleted file mode 100644
index 6b6f8b4360dd..000000000000
--- a/arch/x86/platform/goldfish/goldfish.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2007 Google, Inc.
- * Copyright (C) 2011 Intel, Inc.
- * Copyright (C) 2013 Intel, Inc.
- */
-
-#include <linux/kernel.h>
-#include <linux/irq.h>
-#include <linux/platform_device.h>
-
-/*
- * Where in virtual device memory the IO devices (timers, system controllers
- * and so on)
- */
-
-#define GOLDFISH_PDEV_BUS_BASE (0xff001000)
-#define GOLDFISH_PDEV_BUS_END (0xff7fffff)
-#define GOLDFISH_PDEV_BUS_IRQ (4)
-
-#define GOLDFISH_TTY_BASE (0x2000)
-
-static struct resource goldfish_pdev_bus_resources[] = {
- {
- .start = GOLDFISH_PDEV_BUS_BASE,
- .end = GOLDFISH_PDEV_BUS_END,
- .flags = IORESOURCE_MEM,
- },
- {
- .start = GOLDFISH_PDEV_BUS_IRQ,
- .end = GOLDFISH_PDEV_BUS_IRQ,
- .flags = IORESOURCE_IRQ,
- }
-};
-
-static bool goldfish_enable __initdata;
-
-static int __init goldfish_setup(char *str)
-{
- goldfish_enable = true;
- return 0;
-}
-__setup("goldfish", goldfish_setup);
-
-static int __init goldfish_init(void)
-{
- if (!goldfish_enable)
- return -ENODEV;
-
- platform_device_register_simple("goldfish_pdev_bus", -1,
- goldfish_pdev_bus_resources, 2);
- return 0;
-}
-device_initcall(goldfish_init);
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
index cc2549f0ccb1..ddfc08783fb8 100644
--- a/arch/x86/platform/intel-mid/Makefile
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -1,7 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o pwr.o
-
-# SFI specific code
-ifdef CONFIG_X86_INTEL_MID
-obj-$(CONFIG_SFI) += sfi.o device_libs/
-endif
+obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o pwr.o
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
deleted file mode 100644
index 480fed21cc7d..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ /dev/null
@@ -1,33 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Family-Level Interface Shim (FLIS)
-obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o
-# SDHCI Devices
-obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o
-# WiFi + BT
-obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o
-obj-$(subst m,y,$(CONFIG_BT_HCIUART_BCM)) += platform_bt.o
-# IPC Devices
-obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o
-obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += platform_msic_audio.o
-obj-$(subst m,y,$(CONFIG_GPIO_MSIC)) += platform_msic_gpio.o
-obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_ocd.o
-obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic_battery.o
-obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_msic_power_btn.o
-obj-$(subst m,y,$(CONFIG_INTEL_MFLD_THERMAL)) += platform_msic_thermal.o
-# SPI Devices
-obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_mrfld_spidev.o
-# I2C Devices
-obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o
-obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o
-obj-$(subst m,y,$(CONFIG_MPU3050_I2C)) += platform_mpu3050.o
-obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o
-obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o
-# I2C GPIO Expanders
-obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_max7315.o
-obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_pcal9555a.o
-obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o
-# MISC Devices
-obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o
-obj-$(subst m,y,$(CONFIG_INTEL_MID_POWER_BUTTON)) += platform_mrfld_power_btn.o
-obj-$(subst m,y,$(CONFIG_RTC_DRV_CMOS)) += platform_mrfld_rtc.o
-obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_mrfld_wdt.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
deleted file mode 100644
index 564c47c53f3a..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_bcm43xx.c: bcm43xx platform data initialization file
- *
- * (C) Copyright 2016 Intel Corporation
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/gpio/machine.h>
-#include <linux/platform_device.h>
-#include <linux/regulator/machine.h>
-#include <linux/regulator/fixed.h>
-#include <linux/sfi.h>
-
-#include <asm/intel-mid.h>
-
-#define WLAN_SFI_GPIO_IRQ_NAME "WLAN-interrupt"
-#define WLAN_SFI_GPIO_ENABLE_NAME "WLAN-enable"
-
-#define WLAN_DEV_NAME "0000:00:01.3"
-
-static struct regulator_consumer_supply bcm43xx_vmmc_supply = {
- .dev_name = WLAN_DEV_NAME,
- .supply = "vmmc",
-};
-
-static struct regulator_init_data bcm43xx_vmmc_data = {
- .constraints = {
- .valid_ops_mask = REGULATOR_CHANGE_STATUS,
- },
- .num_consumer_supplies = 1,
- .consumer_supplies = &bcm43xx_vmmc_supply,
-};
-
-static struct fixed_voltage_config bcm43xx_vmmc = {
- .supply_name = "bcm43xx-vmmc-regulator",
- /*
- * Announce 2.0V here to be compatible with SDIO specification. The
- * real voltage and signaling are still 1.8V.
- */
- .microvolts = 2000000, /* 1.8V */
- .startup_delay = 250 * 1000, /* 250ms */
- .enabled_at_boot = 0, /* disabled at boot */
- .init_data = &bcm43xx_vmmc_data,
-};
-
-static struct platform_device bcm43xx_vmmc_regulator = {
- .name = "reg-fixed-voltage",
- .id = PLATFORM_DEVID_AUTO,
- .dev = {
- .platform_data = &bcm43xx_vmmc,
- },
-};
-
-static struct gpiod_lookup_table bcm43xx_vmmc_gpio_table = {
- .dev_id = "reg-fixed-voltage.0",
- .table = {
- GPIO_LOOKUP("0000:00:0c.0", -1, NULL, GPIO_ACTIVE_LOW),
- {}
- },
-};
-
-static int __init bcm43xx_regulator_register(void)
-{
- struct gpiod_lookup_table *table = &bcm43xx_vmmc_gpio_table;
- struct gpiod_lookup *lookup = table->table;
- int ret;
-
- lookup[0].chip_hwnum = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME);
- gpiod_add_lookup_table(table);
-
- ret = platform_device_register(&bcm43xx_vmmc_regulator);
- if (ret) {
- pr_err("%s: vmmc regulator register failed\n", __func__);
- return ret;
- }
-
- return 0;
-}
-
-static void __init *bcm43xx_platform_data(void *info)
-{
- int ret;
-
- ret = bcm43xx_regulator_register();
- if (ret)
- return NULL;
-
- pr_info("Using generic wifi platform data\n");
-
- /* For now it's empty */
- return NULL;
-}
-
-static const struct devs_id bcm43xx_clk_vmmc_dev_id __initconst = {
- .name = "bcm43xx_clk_vmmc",
- .type = SFI_DEV_TYPE_SD,
- .get_platform_data = &bcm43xx_platform_data,
-};
-
-sfi_device(bcm43xx_clk_vmmc_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
deleted file mode 100644
index 32912a17f68e..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_bma023.c: bma023 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- */
-
-#include <asm/intel-mid.h>
-
-static const struct devs_id bma023_dev_id __initconst = {
- .name = "bma023",
- .type = SFI_DEV_TYPE_I2C,
- .delay = 1,
-};
-
-sfi_device(bma023_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
deleted file mode 100644
index 31dda18bb370..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_bt.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Bluetooth platform data initialization file
- *
- * (C) Copyright 2017 Intel Corporation
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/gpio/machine.h>
-#include <linux/pci.h>
-#include <linux/platform_device.h>
-
-#include <asm/cpu_device_id.h>
-#include <asm/intel-family.h>
-#include <asm/intel-mid.h>
-
-struct bt_sfi_data {
- struct device *dev;
- const char *name;
- int (*setup)(struct bt_sfi_data *ddata);
-};
-
-static struct gpiod_lookup_table tng_bt_sfi_gpio_table = {
- .dev_id = "hci_bcm",
- .table = {
- GPIO_LOOKUP("0000:00:0c.0", -1, "device-wakeup", GPIO_ACTIVE_HIGH),
- GPIO_LOOKUP("0000:00:0c.0", -1, "shutdown", GPIO_ACTIVE_HIGH),
- GPIO_LOOKUP("0000:00:0c.0", -1, "host-wakeup", GPIO_ACTIVE_HIGH),
- { },
- },
-};
-
-#define TNG_BT_SFI_GPIO_DEVICE_WAKEUP "bt_wakeup"
-#define TNG_BT_SFI_GPIO_SHUTDOWN "BT-reset"
-#define TNG_BT_SFI_GPIO_HOST_WAKEUP "bt_uart_enable"
-
-static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata)
-{
- struct gpiod_lookup_table *table = &tng_bt_sfi_gpio_table;
- struct gpiod_lookup *lookup = table->table;
- struct pci_dev *pdev;
-
- /* Connected to /dev/ttyS0 */
- pdev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(4, 1));
- if (!pdev)
- return -ENODEV;
-
- ddata->dev = &pdev->dev;
- ddata->name = table->dev_id;
-
- lookup[0].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_DEVICE_WAKEUP);
- lookup[1].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_SHUTDOWN);
- lookup[2].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_HOST_WAKEUP);
-
- gpiod_add_lookup_table(table);
- return 0;
-}
-
-static struct bt_sfi_data tng_bt_sfi_data __initdata = {
- .setup = tng_bt_sfi_setup,
-};
-
-static const struct x86_cpu_id bt_sfi_cpu_ids[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &tng_bt_sfi_data),
- {}
-};
-
-static int __init bt_sfi_init(void)
-{
- struct platform_device_info info;
- struct platform_device *pdev;
- const struct x86_cpu_id *id;
- struct bt_sfi_data *ddata;
- int ret;
-
- id = x86_match_cpu(bt_sfi_cpu_ids);
- if (!id)
- return -ENODEV;
-
- ddata = (struct bt_sfi_data *)id->driver_data;
- if (!ddata)
- return -ENODEV;
-
- ret = ddata->setup(ddata);
- if (ret)
- return ret;
-
- memset(&info, 0, sizeof(info));
- info.fwnode = ddata->dev->fwnode;
- info.parent = ddata->dev;
- info.name = ddata->name,
- info.id = PLATFORM_DEVID_NONE,
-
- pdev = platform_device_register_full(&info);
- if (IS_ERR(pdev))
- return PTR_ERR(pdev);
-
- dev_info(ddata->dev, "Registered Bluetooth device: %s\n", ddata->name);
- return 0;
-}
-device_initcall(bt_sfi_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
deleted file mode 100644
index a2508582a0b1..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_emc1403.c: emc1403 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/gpio.h>
-#include <linux/i2c.h>
-#include <asm/intel-mid.h>
-
-static void __init *emc1403_platform_data(void *info)
-{
- static short intr2nd_pdata;
- struct i2c_board_info *i2c_info = info;
- int intr = get_gpio_by_name("thermal_int");
- int intr2nd = get_gpio_by_name("thermal_alert");
-
- if (intr < 0)
- return NULL;
- if (intr2nd < 0)
- return NULL;
-
- i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
- intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET;
-
- return &intr2nd_pdata;
-}
-
-static const struct devs_id emc1403_dev_id __initconst = {
- .name = "emc1403",
- .type = SFI_DEV_TYPE_I2C,
- .delay = 1,
- .get_platform_data = &emc1403_platform_data,
-};
-
-sfi_device(emc1403_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
deleted file mode 100644
index d9435d2196a4..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
+++ /dev/null
@@ -1,81 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_gpio_keys.c: gpio_keys platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/input.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/gpio.h>
-#include <linux/gpio_keys.h>
-#include <linux/platform_device.h>
-#include <asm/intel-mid.h>
-
-#define DEVICE_NAME "gpio-keys"
-
-/*
- * we will search these buttons in SFI GPIO table (by name)
- * and register them dynamically. Please add all possible
- * buttons here, we will shrink them if no GPIO found.
- */
-static struct gpio_keys_button gpio_button[] = {
- {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000},
- {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20},
- {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20},
- {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20},
- {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20},
- {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20},
- {KEY_MUTE, -1, 1, "mute_enable", EV_KEY, 0, 20},
- {KEY_VOLUMEUP, -1, 1, "volume_up", EV_KEY, 0, 20},
- {KEY_VOLUMEDOWN, -1, 1, "volume_down", EV_KEY, 0, 20},
- {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20},
- {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20},
- {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20},
- {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20},
-};
-
-static struct gpio_keys_platform_data gpio_keys = {
- .buttons = gpio_button,
- .rep = 1,
- .nbuttons = -1, /* will fill it after search */
-};
-
-static struct platform_device pb_device = {
- .name = DEVICE_NAME,
- .id = -1,
- .dev = {
- .platform_data = &gpio_keys,
- },
-};
-
-/*
- * Shrink the non-existent buttons, register the gpio button
- * device if there is some
- */
-static int __init pb_keys_init(void)
-{
- struct gpio_keys_button *gb = gpio_button;
- int i, good = 0;
-
- for (i = 0; i < ARRAY_SIZE(gpio_button); i++) {
- gb[i].gpio = get_gpio_by_name(gb[i].desc);
- pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc,
- gb[i].gpio);
- if (gb[i].gpio < 0)
- continue;
-
- if (i != good)
- gb[good] = gb[i];
- good++;
- }
-
- if (good) {
- gpio_keys.nbuttons = good;
- return platform_device_register(&pb_device);
- }
- return 0;
-}
-late_initcall(pb_keys_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
deleted file mode 100644
index a4485cd638c6..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_lis331.c: lis331 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/i2c.h>
-#include <linux/gpio.h>
-#include <asm/intel-mid.h>
-
-static void __init *lis331dl_platform_data(void *info)
-{
- static short intr2nd_pdata;
- struct i2c_board_info *i2c_info = info;
- int intr = get_gpio_by_name("accel_int");
- int intr2nd = get_gpio_by_name("accel_2");
-
- if (intr < 0)
- return NULL;
- if (intr2nd < 0)
- return NULL;
-
- i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
- intr2nd_pdata = intr2nd + INTEL_MID_IRQ_OFFSET;
-
- return &intr2nd_pdata;
-}
-
-static const struct devs_id lis331dl_dev_id __initconst = {
- .name = "i2c_accel",
- .type = SFI_DEV_TYPE_I2C,
- .get_platform_data = &lis331dl_platform_data,
-};
-
-sfi_device(lis331dl_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
deleted file mode 100644
index e9287c3184da..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_max7315.c: max7315 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/gpio.h>
-#include <linux/i2c.h>
-#include <linux/platform_data/pca953x.h>
-#include <asm/intel-mid.h>
-
-#define MAX7315_NUM 2
-
-static void __init *max7315_platform_data(void *info)
-{
- static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
- static int nr;
- struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
- struct i2c_board_info *i2c_info = info;
- int gpio_base, intr;
- char base_pin_name[SFI_NAME_LEN + 1];
- char intr_pin_name[SFI_NAME_LEN + 1];
-
- if (nr == MAX7315_NUM) {
- pr_err("too many max7315s, we only support %d\n",
- MAX7315_NUM);
- return NULL;
- }
- /* we have several max7315 on the board, we only need load several
- * instances of the same pca953x driver to cover them
- */
- strcpy(i2c_info->type, "max7315");
- if (nr++) {
- snprintf(base_pin_name, sizeof(base_pin_name),
- "max7315_%d_base", nr);
- snprintf(intr_pin_name, sizeof(intr_pin_name),
- "max7315_%d_int", nr);
- } else {
- strcpy(base_pin_name, "max7315_base");
- strcpy(intr_pin_name, "max7315_int");
- }
-
- gpio_base = get_gpio_by_name(base_pin_name);
- intr = get_gpio_by_name(intr_pin_name);
-
- if (gpio_base < 0)
- return NULL;
- max7315->gpio_base = gpio_base;
- if (intr != -1) {
- i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
- max7315->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
- } else {
- i2c_info->irq = -1;
- max7315->irq_base = -1;
- }
- return max7315;
-}
-
-static const struct devs_id max7315_dev_id __initconst = {
- .name = "i2c_max7315",
- .type = SFI_DEV_TYPE_I2C,
- .delay = 1,
- .get_platform_data = &max7315_platform_data,
-};
-
-static const struct devs_id max7315_2_dev_id __initconst = {
- .name = "i2c_max7315_2",
- .type = SFI_DEV_TYPE_I2C,
- .delay = 1,
- .get_platform_data = &max7315_platform_data,
-};
-
-sfi_device(max7315_dev_id);
-sfi_device(max7315_2_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
deleted file mode 100644
index 28a182713934..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_mpu3050.c: mpu3050 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/gpio.h>
-#include <linux/i2c.h>
-#include <asm/intel-mid.h>
-
-static void *mpu3050_platform_data(void *info)
-{
- struct i2c_board_info *i2c_info = info;
- int intr = get_gpio_by_name("mpu3050_int");
-
- if (intr < 0)
- return NULL;
-
- i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
- return NULL;
-}
-
-static const struct devs_id mpu3050_dev_id __initconst = {
- .name = "mpu3050",
- .type = SFI_DEV_TYPE_I2C,
- .delay = 1,
- .get_platform_data = &mpu3050_platform_data,
-};
-
-sfi_device(mpu3050_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c
deleted file mode 100644
index 605e1f94ad89..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_pinctrl.c
+++ /dev/null
@@ -1,39 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Merrifield FLIS platform device initialization file
- *
- * Copyright (C) 2016, Intel Corporation
- *
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/platform_device.h>
-
-#include <asm/intel-mid.h>
-
-#define FLIS_BASE_ADDR 0xff0c0000
-#define FLIS_LENGTH 0x8000
-
-static struct resource mrfld_pinctrl_mmio_resource = {
- .start = FLIS_BASE_ADDR,
- .end = FLIS_BASE_ADDR + FLIS_LENGTH - 1,
- .flags = IORESOURCE_MEM,
-};
-
-static struct platform_device mrfld_pinctrl_device = {
- .name = "pinctrl-merrifield",
- .id = PLATFORM_DEVID_NONE,
- .resource = &mrfld_pinctrl_mmio_resource,
- .num_resources = 1,
-};
-
-static int __init mrfld_pinctrl_init(void)
-{
- if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
- return platform_device_register(&mrfld_pinctrl_device);
-
- return -ENODEV;
-}
-arch_initcall(mrfld_pinctrl_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c
deleted file mode 100644
index ec2afb41b34a..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_power_btn.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Merrifield power button support
- *
- * (C) Copyright 2017 Intel Corporation
- *
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/platform_device.h>
-#include <linux/sfi.h>
-
-#include <asm/intel-mid.h>
-#include <asm/intel_scu_ipc.h>
-
-static struct resource mrfld_power_btn_resources[] = {
- {
- .flags = IORESOURCE_IRQ,
- },
-};
-
-static struct platform_device mrfld_power_btn_dev = {
- .name = "msic_power_btn",
- .id = PLATFORM_DEVID_NONE,
- .num_resources = ARRAY_SIZE(mrfld_power_btn_resources),
- .resource = mrfld_power_btn_resources,
-};
-
-static int mrfld_power_btn_scu_status_change(struct notifier_block *nb,
- unsigned long code, void *data)
-{
- if (code == SCU_DOWN) {
- platform_device_unregister(&mrfld_power_btn_dev);
- return 0;
- }
-
- return platform_device_register(&mrfld_power_btn_dev);
-}
-
-static struct notifier_block mrfld_power_btn_scu_notifier = {
- .notifier_call = mrfld_power_btn_scu_status_change,
-};
-
-static int __init register_mrfld_power_btn(void)
-{
- if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
- return -ENODEV;
-
- /*
- * We need to be sure that the SCU IPC is ready before
- * PMIC power button device can be registered:
- */
- intel_scu_notifier_add(&mrfld_power_btn_scu_notifier);
-
- return 0;
-}
-arch_initcall(register_mrfld_power_btn);
-
-static void __init *mrfld_power_btn_platform_data(void *info)
-{
- struct resource *res = mrfld_power_btn_resources;
- struct sfi_device_table_entry *pentry = info;
-
- res->start = res->end = pentry->irq;
- return NULL;
-}
-
-static const struct devs_id mrfld_power_btn_dev_id __initconst = {
- .name = "bcove_power_btn",
- .type = SFI_DEV_TYPE_IPC,
- .delay = 1,
- .msic = 1,
- .get_platform_data = &mrfld_power_btn_platform_data,
-};
-
-sfi_device(mrfld_power_btn_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c
deleted file mode 100644
index 40e9808a9634..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_rtc.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Merrifield legacy RTC initialization file
- *
- * (C) Copyright 2017 Intel Corporation
- *
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/init.h>
-
-#include <asm/hw_irq.h>
-#include <asm/intel-mid.h>
-#include <asm/io_apic.h>
-#include <asm/time.h>
-#include <asm/x86_init.h>
-
-static int __init mrfld_legacy_rtc_alloc_irq(void)
-{
- struct irq_alloc_info info;
- int ret;
-
- if (!x86_platform.legacy.rtc)
- return -ENODEV;
-
- ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, 0);
- ret = mp_map_gsi_to_irq(RTC_IRQ, IOAPIC_MAP_ALLOC, &info);
- if (ret < 0) {
- pr_info("Failed to allocate RTC interrupt. Disabling RTC\n");
- x86_platform.legacy.rtc = 0;
- return ret;
- }
-
- return 0;
-}
-
-static int __init mrfld_legacy_rtc_init(void)
-{
- if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
- return -ENODEV;
-
- return mrfld_legacy_rtc_alloc_irq();
-}
-arch_initcall(mrfld_legacy_rtc_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
deleted file mode 100644
index fe3b7ff975f3..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * SDHCI platform data initilisation file
- *
- * (C) Copyright 2016 Intel Corporation
- * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/pci.h>
-
-#include <linux/mmc/sdhci-pci-data.h>
-
-#include <asm/intel-mid.h>
-
-#define INTEL_MRFLD_SD 2
-#define INTEL_MRFLD_SD_CD_GPIO 77
-
-static struct sdhci_pci_data mrfld_sdhci_pci_data = {
- .rst_n_gpio = -EINVAL,
- .cd_gpio = INTEL_MRFLD_SD_CD_GPIO,
-};
-
-static struct sdhci_pci_data *
-mrfld_sdhci_pci_get_data(struct pci_dev *pdev, int slotno)
-{
- unsigned int func = PCI_FUNC(pdev->devfn);
-
- if (func == INTEL_MRFLD_SD)
- return &mrfld_sdhci_pci_data;
-
- return NULL;
-}
-
-static int __init mrfld_sd_init(void)
-{
- if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
- return -ENODEV;
-
- sdhci_pci_get_data = mrfld_sdhci_pci_get_data;
- return 0;
-}
-arch_initcall(mrfld_sd_init);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c
deleted file mode 100644
index b828f4fd40be..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_spidev.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * spidev platform data initialization file
- *
- * (C) Copyright 2014, 2016 Intel Corporation
- * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- * Dan O'Donovan <dan@emutex.com>
- */
-
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/spi/pxa2xx_spi.h>
-#include <linux/spi/spi.h>
-
-#include <asm/intel-mid.h>
-
-#define MRFLD_SPI_DEFAULT_DMA_BURST 8
-#define MRFLD_SPI_DEFAULT_TIMEOUT 500
-
-/* GPIO pin for spidev chipselect */
-#define MRFLD_SPIDEV_GPIO_CS 111
-
-static struct pxa2xx_spi_chip spidev_spi_chip = {
- .dma_burst_size = MRFLD_SPI_DEFAULT_DMA_BURST,
- .timeout = MRFLD_SPI_DEFAULT_TIMEOUT,
- .gpio_cs = MRFLD_SPIDEV_GPIO_CS,
-};
-
-static void __init *spidev_platform_data(void *info)
-{
- struct spi_board_info *spi_info = info;
-
- if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
- return ERR_PTR(-ENODEV);
-
- spi_info->mode = SPI_MODE_0;
- spi_info->controller_data = &spidev_spi_chip;
-
- return NULL;
-}
-
-static const struct devs_id spidev_dev_id __initconst = {
- .name = "spidev",
- .type = SFI_DEV_TYPE_SPI,
- .delay = 0,
- .get_platform_data = &spidev_platform_data,
-};
-
-sfi_device(spidev_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
deleted file mode 100644
index 227218a8f98e..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
+++ /dev/null
@@ -1,82 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel Merrifield watchdog platform device library file
- *
- * (C) Copyright 2014 Intel Corporation
- * Author: David Cohen <david.a.cohen@linux.intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/platform_device.h>
-#include <linux/platform_data/intel-mid_wdt.h>
-
-#include <asm/intel-mid.h>
-#include <asm/intel_scu_ipc.h>
-#include <asm/io_apic.h>
-#include <asm/hw_irq.h>
-
-#define TANGIER_EXT_TIMER0_MSI 12
-
-static struct platform_device wdt_dev = {
- .name = "intel_mid_wdt",
- .id = -1,
-};
-
-static int tangier_probe(struct platform_device *pdev)
-{
- struct irq_alloc_info info;
- struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
- int gsi = TANGIER_EXT_TIMER0_MSI;
- int irq;
-
- if (!pdata)
- return -EINVAL;
-
- /* IOAPIC builds identity mapping between GSI and IRQ on MID */
- ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0);
- irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
- if (irq < 0) {
- dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n", gsi);
- return irq;
- }
-
- pdata->irq = irq;
- return 0;
-}
-
-static struct intel_mid_wdt_pdata tangier_pdata = {
- .probe = tangier_probe,
-};
-
-static int wdt_scu_status_change(struct notifier_block *nb,
- unsigned long code, void *data)
-{
- if (code == SCU_DOWN) {
- platform_device_unregister(&wdt_dev);
- return 0;
- }
-
- return platform_device_register(&wdt_dev);
-}
-
-static struct notifier_block wdt_scu_notifier = {
- .notifier_call = wdt_scu_status_change,
-};
-
-static int __init register_mid_wdt(void)
-{
- if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
- return -ENODEV;
-
- wdt_dev.dev.platform_data = &tangier_pdata;
-
- /*
- * We need to be sure that the SCU IPC is ready before watchdog device
- * can be registered:
- */
- intel_scu_notifier_add(&wdt_scu_notifier);
-
- return 0;
-}
-arch_initcall(register_mid_wdt);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.c b/arch/x86/platform/intel-mid/device_libs/platform_msic.c
deleted file mode 100644
index b17783d0d4e7..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_msic.c: MSIC platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/scatterlist.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/mfd/intel_msic.h>
-#include <asm/intel_scu_ipc.h>
-#include <asm/intel-mid.h>
-#include "platform_msic.h"
-
-struct intel_msic_platform_data msic_pdata;
-
-static struct resource msic_resources[] = {
- {
- .start = INTEL_MSIC_IRQ_PHYS_BASE,
- .end = INTEL_MSIC_IRQ_PHYS_BASE + 64 - 1,
- .flags = IORESOURCE_MEM,
- },
-};
-
-static struct platform_device msic_device = {
- .name = "intel_msic",
- .id = -1,
- .dev = {
- .platform_data = &msic_pdata,
- },
- .num_resources = ARRAY_SIZE(msic_resources),
- .resource = msic_resources,
-};
-
-static int msic_scu_status_change(struct notifier_block *nb,
- unsigned long code, void *data)
-{
- if (code == SCU_DOWN) {
- platform_device_unregister(&msic_device);
- return 0;
- }
-
- return platform_device_register(&msic_device);
-}
-
-static int __init msic_init(void)
-{
- static struct notifier_block msic_scu_notifier = {
- .notifier_call = msic_scu_status_change,
- };
-
- /*
- * We need to be sure that the SCU IPC is ready before MSIC device
- * can be registered.
- */
- if (intel_mid_has_msic())
- intel_scu_notifier_add(&msic_scu_notifier);
-
- return 0;
-}
-arch_initcall(msic_init);
-
-/*
- * msic_generic_platform_data - sets generic platform data for the block
- * @info: pointer to the SFI device table entry for this block
- * @block: MSIC block
- *
- * Function sets IRQ number from the SFI table entry for given device to
- * the MSIC platform data.
- */
-void *msic_generic_platform_data(void *info, enum intel_msic_block block)
-{
- struct sfi_device_table_entry *entry = info;
-
- BUG_ON(block < 0 || block >= INTEL_MSIC_BLOCK_LAST);
- msic_pdata.irq[block] = entry->irq;
-
- return NULL;
-}
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.h b/arch/x86/platform/intel-mid/device_libs/platform_msic.h
deleted file mode 100644
index 91deb2e65b0e..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * platform_msic.h: MSIC platform data header file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-#ifndef _PLATFORM_MSIC_H_
-#define _PLATFORM_MSIC_H_
-
-extern struct intel_msic_platform_data msic_pdata;
-
-void *msic_generic_platform_data(void *info, enum intel_msic_block block);
-
-#endif
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
deleted file mode 100644
index e765da78ad8c..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_msic_audio.c: MSIC audio platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/scatterlist.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/platform_device.h>
-#include <linux/mfd/intel_msic.h>
-#include <asm/intel-mid.h>
-
-#include "platform_msic.h"
-
-static void *msic_audio_platform_data(void *info)
-{
- struct platform_device *pdev;
-
- pdev = platform_device_register_simple("sst-platform", -1, NULL, 0);
-
- if (IS_ERR(pdev)) {
- pr_err("failed to create audio platform device\n");
- return NULL;
- }
-
- return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_AUDIO);
-}
-
-static const struct devs_id msic_audio_dev_id __initconst = {
- .name = "msic_audio",
- .type = SFI_DEV_TYPE_IPC,
- .delay = 1,
- .msic = 1,
- .get_platform_data = &msic_audio_platform_data,
-};
-
-sfi_device(msic_audio_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
deleted file mode 100644
index f461f84903f8..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_msic_battery.c: MSIC battery platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/scatterlist.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/mfd/intel_msic.h>
-#include <asm/intel-mid.h>
-
-#include "platform_msic.h"
-
-static void __init *msic_battery_platform_data(void *info)
-{
- return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_BATTERY);
-}
-
-static const struct devs_id msic_battery_dev_id __initconst = {
- .name = "msic_battery",
- .type = SFI_DEV_TYPE_IPC,
- .delay = 1,
- .msic = 1,
- .get_platform_data = &msic_battery_platform_data,
-};
-
-sfi_device(msic_battery_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
deleted file mode 100644
index 71a7d6db3878..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_msic_gpio.c: MSIC GPIO platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/scatterlist.h>
-#include <linux/sfi.h>
-#include <linux/init.h>
-#include <linux/gpio.h>
-#include <linux/mfd/intel_msic.h>
-#include <asm/intel-mid.h>
-
-#include "platform_msic.h"
-
-static void __init *msic_gpio_platform_data(void *info)
-{
- static struct intel_msic_gpio_pdata msic_gpio_pdata;
-
- int gpio = get_gpio_by_name("msic_gpio_base");
-
- if (gpio < 0)
- return NULL;
-
- msic_gpio_pdata.gpio_base = gpio;
- msic_pdata.gpio = &msic_gpio_pdata;
-
- return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_GPIO);
-}
-
-static const struct devs_id msic_gpio_dev_id __initconst = {
- .name = "msic_gpio",
- .type = SFI_DEV_TYPE_IPC,
- .delay = 1,
- .msic = 1,
- .get_platform_data = &msic_gpio_platform_data,
-};
-
-sfi_device(msic_gpio_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
deleted file mode 100644
index 558c0d974430..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_msic_ocd.c: MSIC OCD platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/scatterlist.h>
-#include <linux/sfi.h>
-#include <linux/init.h>
-#include <linux/gpio.h>
-#include <linux/mfd/intel_msic.h>
-#include <asm/intel-mid.h>
-
-#include "platform_msic.h"
-
-static void __init *msic_ocd_platform_data(void *info)
-{
- static struct intel_msic_ocd_pdata msic_ocd_pdata;
- int gpio;
-
- gpio = get_gpio_by_name("ocd_gpio");
-
- if (gpio < 0)
- return NULL;
-
- msic_ocd_pdata.gpio = gpio;
- msic_pdata.ocd = &msic_ocd_pdata;
-
- return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD);
-}
-
-static const struct devs_id msic_ocd_dev_id __initconst = {
- .name = "msic_ocd",
- .type = SFI_DEV_TYPE_IPC,
- .delay = 1,
- .msic = 1,
- .get_platform_data = &msic_ocd_platform_data,
-};
-
-sfi_device(msic_ocd_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
deleted file mode 100644
index 3d3de2d59726..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
+++ /dev/null
@@ -1,31 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_msic_power_btn.c: MSIC power btn platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/scatterlist.h>
-#include <linux/sfi.h>
-#include <linux/init.h>
-#include <linux/mfd/intel_msic.h>
-#include <asm/intel-mid.h>
-
-#include "platform_msic.h"
-
-static void __init *msic_power_btn_platform_data(void *info)
-{
- return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_POWER_BTN);
-}
-
-static const struct devs_id msic_power_btn_dev_id __initconst = {
- .name = "msic_power_btn",
- .type = SFI_DEV_TYPE_IPC,
- .delay = 1,
- .msic = 1,
- .get_platform_data = &msic_power_btn_platform_data,
-};
-
-sfi_device(msic_power_btn_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
deleted file mode 100644
index 4858da1d78c6..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
+++ /dev/null
@@ -1,32 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_msic_thermal.c: msic_thermal platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/input.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/gpio.h>
-#include <linux/platform_device.h>
-#include <linux/mfd/intel_msic.h>
-#include <asm/intel-mid.h>
-
-#include "platform_msic.h"
-
-static void __init *msic_thermal_platform_data(void *info)
-{
- return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL);
-}
-
-static const struct devs_id msic_thermal_dev_id __initconst = {
- .name = "msic_thermal",
- .type = SFI_DEV_TYPE_IPC,
- .delay = 1,
- .msic = 1,
- .get_platform_data = &msic_thermal_platform_data,
-};
-
-sfi_device(msic_thermal_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c b/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c
deleted file mode 100644
index 5609d8da3978..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_pcal9555a.c
+++ /dev/null
@@ -1,95 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * PCAL9555a platform data initialization file
- *
- * Copyright (C) 2016, Intel Corporation
- *
- * Authors: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
- * Dan O'Donovan <dan@emutex.com>
- */
-
-#include <linux/gpio.h>
-#include <linux/init.h>
-#include <linux/i2c.h>
-#include <linux/platform_data/pca953x.h>
-#include <linux/sfi.h>
-
-#include <asm/intel-mid.h>
-
-#define PCAL9555A_NUM 4
-
-static struct pca953x_platform_data pcal9555a_pdata[PCAL9555A_NUM];
-static int nr;
-
-static void __init *pcal9555a_platform_data(void *info)
-{
- struct i2c_board_info *i2c_info = info;
- char *type = i2c_info->type;
- struct pca953x_platform_data *pcal9555a;
- char base_pin_name[SFI_NAME_LEN + 1];
- char intr_pin_name[SFI_NAME_LEN + 1];
- int gpio_base, intr;
-
- snprintf(base_pin_name, sizeof(base_pin_name), "%s_base", type);
- snprintf(intr_pin_name, sizeof(intr_pin_name), "%s_int", type);
-
- gpio_base = get_gpio_by_name(base_pin_name);
- intr = get_gpio_by_name(intr_pin_name);
-
- /* Check if the SFI record valid */
- if (gpio_base == -1)
- return NULL;
-
- if (nr >= PCAL9555A_NUM) {
- pr_err("%s: Too many instances, only %d supported\n", __func__,
- PCAL9555A_NUM);
- return NULL;
- }
-
- pcal9555a = &pcal9555a_pdata[nr++];
- pcal9555a->gpio_base = gpio_base;
-
- if (intr >= 0) {
- i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
- pcal9555a->irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
- } else {
- i2c_info->irq = -1;
- pcal9555a->irq_base = -1;
- }
-
- strcpy(type, "pcal9555a");
- return pcal9555a;
-}
-
-static const struct devs_id pcal9555a_1_dev_id __initconst = {
- .name = "pcal9555a-1",
- .type = SFI_DEV_TYPE_I2C,
- .delay = 1,
- .get_platform_data = &pcal9555a_platform_data,
-};
-
-static const struct devs_id pcal9555a_2_dev_id __initconst = {
- .name = "pcal9555a-2",
- .type = SFI_DEV_TYPE_I2C,
- .delay = 1,
- .get_platform_data = &pcal9555a_platform_data,
-};
-
-static const struct devs_id pcal9555a_3_dev_id __initconst = {
- .name = "pcal9555a-3",
- .type = SFI_DEV_TYPE_I2C,
- .delay = 1,
- .get_platform_data = &pcal9555a_platform_data,
-};
-
-static const struct devs_id pcal9555a_4_dev_id __initconst = {
- .name = "pcal9555a-4",
- .type = SFI_DEV_TYPE_I2C,
- .delay = 1,
- .get_platform_data = &pcal9555a_platform_data,
-};
-
-sfi_device(pcal9555a_1_dev_id);
-sfi_device(pcal9555a_2_dev_id);
-sfi_device(pcal9555a_3_dev_id);
-sfi_device(pcal9555a_4_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
deleted file mode 100644
index 139738bbdd36..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
+++ /dev/null
@@ -1,42 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_tc35876x.c: tc35876x platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/gpio/machine.h>
-#include <asm/intel-mid.h>
-
-static struct gpiod_lookup_table tc35876x_gpio_table = {
- .dev_id = "i2c_disp_brig",
- .table = {
- GPIO_LOOKUP("0000:00:0c.0", -1, "bridge-reset", GPIO_ACTIVE_HIGH),
- GPIO_LOOKUP("0000:00:0c.0", -1, "bl-en", GPIO_ACTIVE_HIGH),
- GPIO_LOOKUP("0000:00:0c.0", -1, "vadd", GPIO_ACTIVE_HIGH),
- { },
- },
-};
-
-/*tc35876x DSI_LVDS bridge chip and panel platform data*/
-static void *tc35876x_platform_data(void *data)
-{
- struct gpiod_lookup_table *table = &tc35876x_gpio_table;
- struct gpiod_lookup *lookup = table->table;
-
- lookup[0].chip_hwnum = get_gpio_by_name("LCMB_RXEN");
- lookup[1].chip_hwnum = get_gpio_by_name("6S6P_BL_EN");
- lookup[2].chip_hwnum = get_gpio_by_name("EN_VREG_LCD_V3P3");
- gpiod_add_lookup_table(table);
-
- return NULL;
-}
-
-static const struct devs_id tc35876x_dev_id __initconst = {
- .name = "i2c_disp_brig",
- .type = SFI_DEV_TYPE_I2C,
- .get_platform_data = &tc35876x_platform_data,
-};
-
-sfi_device(tc35876x_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
deleted file mode 100644
index e689d8f61059..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * platform_tca6416.c: tca6416 platform data initialization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/platform_data/pca953x.h>
-#include <linux/i2c.h>
-#include <linux/gpio.h>
-#include <asm/intel-mid.h>
-
-#define TCA6416_NAME "tca6416"
-#define TCA6416_BASE "tca6416_base"
-#define TCA6416_INTR "tca6416_int"
-
-static void *tca6416_platform_data(void *info)
-{
- static struct pca953x_platform_data tca6416;
- struct i2c_board_info *i2c_info = info;
- int gpio_base, intr;
- char base_pin_name[SFI_NAME_LEN + 1];
- char intr_pin_name[SFI_NAME_LEN + 1];
-
- strcpy(i2c_info->type, TCA6416_NAME);
- strcpy(base_pin_name, TCA6416_BASE);
- strcpy(intr_pin_name, TCA6416_INTR);
-
- gpio_base = get_gpio_by_name(base_pin_name);
- intr = get_gpio_by_name(intr_pin_name);
-
- if (gpio_base < 0)
- return NULL;
- tca6416.gpio_base = gpio_base;
- if (intr >= 0) {
- i2c_info->irq = intr + INTEL_MID_IRQ_OFFSET;
- tca6416.irq_base = gpio_base + INTEL_MID_IRQ_OFFSET;
- } else {
- i2c_info->irq = -1;
- tca6416.irq_base = -1;
- }
- return &tca6416;
-}
-
-static const struct devs_id tca6416_dev_id __initconst = {
- .name = "tca6416",
- .type = SFI_DEV_TYPE_I2C,
- .delay = 1,
- .get_platform_data = &tca6416_platform_data,
-};
-
-sfi_device(tca6416_dev_id);
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 780728161f7d..f4592dc7a1c1 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * intel-mid.c: Intel MID platform setup code
+ * Intel MID platform setup code
*
- * (C) Copyright 2008, 2012 Intel Corporation
+ * (C) Copyright 2008, 2012, 2021 Intel Corporation
* Author: Jacob Pan (jacob.jun.pan@intel.com)
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
*/
@@ -14,7 +14,6 @@
#include <linux/interrupt.h>
#include <linux/regulator/machine.h>
#include <linux/scatterlist.h>
-#include <linux/sfi.h>
#include <linux/irq.h>
#include <linux/export.h>
#include <linux/notifier.h>
@@ -25,38 +24,13 @@
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/intel-mid.h>
-#include <asm/intel_mid_vrtc.h>
#include <asm/io.h>
#include <asm/i8259.h>
#include <asm/intel_scu_ipc.h>
-#include <asm/apb_timer.h>
#include <asm/reboot.h>
-/*
- * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
- * cmdline option x86_intel_mid_timer can be used to override the configuration
- * to prefer one or the other.
- * at runtime, there are basically three timer configurations:
- * 1. per cpu apbt clock only
- * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
- * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
- *
- * by default (without cmdline option), platform code first detects cpu type
- * to see if we are on lincroft or penwell, then set up both lapic or apbt
- * clocks accordingly.
- * i.e. by default, medfield uses configuration #2, moorestown uses #1.
- * config #3 is supported but not recommended on medfield.
- *
- * rating and feature summary:
- * lapic (with C3STOP) --------- 100
- * apbt (always-on) ------------ 110
- * lapic (always-on,ARAT) ------ 150
- */
-
-enum intel_mid_timer_options intel_mid_timer_options;
-
-enum intel_mid_cpu_type __intel_mid_cpu_chip;
-EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
+#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */
+#define IPCMSG_COLD_RESET 0xF1
static void intel_mid_power_off(void)
{
@@ -64,69 +38,32 @@ static void intel_mid_power_off(void)
intel_mid_pwr_power_off();
/* Only for Tangier, the rest will ignore this command */
- intel_scu_ipc_simple_command(IPCMSG_COLD_OFF, 1);
+ intel_scu_ipc_dev_simple_command(NULL, IPCMSG_COLD_OFF, 1);
};
static void intel_mid_reboot(void)
{
- intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0);
-}
-
-static void __init intel_mid_setup_bp_timer(void)
-{
- apbt_time_init();
- setup_boot_APIC_clock();
+ intel_scu_ipc_dev_simple_command(NULL, IPCMSG_COLD_RESET, 0);
}
static void __init intel_mid_time_init(void)
{
- sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
-
- switch (intel_mid_timer_options) {
- case INTEL_MID_TIMER_APBT_ONLY:
- break;
- case INTEL_MID_TIMER_LAPIC_APBT:
- /* Use apbt and local apic */
- x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer;
- x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
- return;
- default:
- if (!boot_cpu_has(X86_FEATURE_ARAT))
- break;
- /* Lapic only, no apbt */
- x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
- x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
- return;
- }
-
- x86_init.timers.setup_percpu_clockev = apbt_time_init;
+ /* Lapic only, no apbt */
+ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+ x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
}
static void intel_mid_arch_setup(void)
{
- if (boot_cpu_data.x86 != 6) {
- pr_err("Unknown Intel MID CPU (%d:%d), default to Penwell\n",
- boot_cpu_data.x86, boot_cpu_data.x86_model);
- __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
- goto out;
- }
-
switch (boot_cpu_data.x86_model) {
- case 0x35:
- __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_CLOVERVIEW;
- break;
case 0x3C:
case 0x4A:
- __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_TANGIER;
x86_platform.legacy.rtc = 1;
break;
- case 0x27:
default:
- __intel_mid_cpu_chip = INTEL_MID_CPU_CHIP_PENWELL;
break;
}
-out:
/*
* Intel MID platforms are using explicitly defined regulators.
*
@@ -159,14 +96,11 @@ void __init x86_intel_mid_early_setup(void)
x86_init.timers.timer_init = intel_mid_time_init;
x86_init.timers.setup_percpu_clockev = x86_init_noop;
- x86_init.timers.wallclock_init = intel_mid_rtc_init;
x86_init.irqs.pre_vector_init = x86_init_noop;
x86_init.oem.arch_setup = intel_mid_arch_setup;
- x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
-
x86_platform.get_nmi_reason = intel_mid_get_nmi_reason;
x86_init.pci.arch_init = intel_mid_pci_init;
@@ -188,25 +122,3 @@ void __init x86_intel_mid_early_setup(void)
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
set_bit(MP_BUS_ISA, mp_bus_not_pci);
}
-
-/*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_intel_mid_timer(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp("apbt_only", arg) == 0)
- intel_mid_timer_options = INTEL_MID_TIMER_APBT_ONLY;
- else if (strcmp("lapic_and_apbt", arg) == 0)
- intel_mid_timer_options = INTEL_MID_TIMER_LAPIC_APBT;
- else {
- pr_warn("X86 INTEL_MID timer option %s not recognised use x86_intel_mid_timer=apbt_only or lapic_and_apbt\n",
- arg);
- return -EINVAL;
- }
- return 0;
-}
-__setup("x86_intel_mid_timer=", setup_x86_intel_mid_timer);
diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
deleted file mode 100644
index 2226da4f437a..000000000000
--- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c
+++ /dev/null
@@ -1,173 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * intel_mid_vrtc.c: Driver for virtual RTC device on Intel MID platform
- *
- * (C) Copyright 2009 Intel Corporation
- *
- * Note:
- * VRTC is emulated by system controller firmware, the real HW
- * RTC is located in the PMIC device. SCU FW shadows PMIC RTC
- * in a memory mapped IO space that is visible to the host IA
- * processor.
- *
- * This driver is based on RTC CMOS driver.
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/platform_device.h>
-#include <linux/mc146818rtc.h>
-
-#include <asm/intel-mid.h>
-#include <asm/intel_mid_vrtc.h>
-#include <asm/time.h>
-#include <asm/fixmap.h>
-
-static unsigned char __iomem *vrtc_virt_base;
-
-unsigned char vrtc_cmos_read(unsigned char reg)
-{
- unsigned char retval;
-
- /* vRTC's registers range from 0x0 to 0xD */
- if (reg > 0xd || !vrtc_virt_base)
- return 0xff;
-
- lock_cmos_prefix(reg);
- retval = __raw_readb(vrtc_virt_base + (reg << 2));
- lock_cmos_suffix(reg);
- return retval;
-}
-EXPORT_SYMBOL_GPL(vrtc_cmos_read);
-
-void vrtc_cmos_write(unsigned char val, unsigned char reg)
-{
- if (reg > 0xd || !vrtc_virt_base)
- return;
-
- lock_cmos_prefix(reg);
- __raw_writeb(val, vrtc_virt_base + (reg << 2));
- lock_cmos_suffix(reg);
-}
-EXPORT_SYMBOL_GPL(vrtc_cmos_write);
-
-void vrtc_get_time(struct timespec64 *now)
-{
- u8 sec, min, hour, mday, mon;
- unsigned long flags;
- u32 year;
-
- spin_lock_irqsave(&rtc_lock, flags);
-
- while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
- cpu_relax();
-
- sec = vrtc_cmos_read(RTC_SECONDS);
- min = vrtc_cmos_read(RTC_MINUTES);
- hour = vrtc_cmos_read(RTC_HOURS);
- mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
- mon = vrtc_cmos_read(RTC_MONTH);
- year = vrtc_cmos_read(RTC_YEAR);
-
- spin_unlock_irqrestore(&rtc_lock, flags);
-
- /* vRTC YEAR reg contains the offset to 1972 */
- year += 1972;
-
- pr_info("vRTC: sec: %d min: %d hour: %d day: %d "
- "mon: %d year: %d\n", sec, min, hour, mday, mon, year);
-
- now->tv_sec = mktime64(year, mon, mday, hour, min, sec);
- now->tv_nsec = 0;
-}
-
-int vrtc_set_mmss(const struct timespec64 *now)
-{
- unsigned long flags;
- struct rtc_time tm;
- int year;
- int retval = 0;
-
- rtc_time64_to_tm(now->tv_sec, &tm);
- if (!rtc_valid_tm(&tm) && tm.tm_year >= 72) {
- /*
- * tm.year is the number of years since 1900, and the
- * vrtc need the years since 1972.
- */
- year = tm.tm_year - 72;
- spin_lock_irqsave(&rtc_lock, flags);
- vrtc_cmos_write(year, RTC_YEAR);
- vrtc_cmos_write(tm.tm_mon, RTC_MONTH);
- vrtc_cmos_write(tm.tm_mday, RTC_DAY_OF_MONTH);
- vrtc_cmos_write(tm.tm_hour, RTC_HOURS);
- vrtc_cmos_write(tm.tm_min, RTC_MINUTES);
- vrtc_cmos_write(tm.tm_sec, RTC_SECONDS);
- spin_unlock_irqrestore(&rtc_lock, flags);
- } else {
- pr_err("%s: Invalid vRTC value: write of %llx to vRTC failed\n",
- __func__, (s64)now->tv_sec);
- retval = -EINVAL;
- }
- return retval;
-}
-
-void __init intel_mid_rtc_init(void)
-{
- unsigned long vrtc_paddr;
-
- sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
-
- vrtc_paddr = sfi_mrtc_array[0].phys_addr;
- if (!sfi_mrtc_num || !vrtc_paddr)
- return;
-
- vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC,
- vrtc_paddr);
- x86_platform.get_wallclock = vrtc_get_time;
- x86_platform.set_wallclock = vrtc_set_mmss;
-}
-
-/*
- * The Moorestown platform has a memory mapped virtual RTC device that emulates
- * the programming interface of the RTC.
- */
-
-static struct resource vrtc_resources[] = {
- [0] = {
- .flags = IORESOURCE_MEM,
- },
- [1] = {
- .flags = IORESOURCE_IRQ,
- }
-};
-
-static struct platform_device vrtc_device = {
- .name = "rtc_mrst",
- .id = -1,
- .resource = vrtc_resources,
- .num_resources = ARRAY_SIZE(vrtc_resources),
-};
-
-/* Register the RTC device if appropriate */
-static int __init intel_mid_device_create(void)
-{
- /* No Moorestown, no device */
- if (!intel_mid_identify_cpu())
- return -ENODEV;
- /* No timer, no device */
- if (!sfi_mrtc_num)
- return -ENODEV;
-
- /* iomem resource */
- vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr;
- vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr +
- MRST_VRTC_MAP_SZ;
- /* irq resource */
- vrtc_resources[1].start = sfi_mrtc_array[0].irq;
- vrtc_resources[1].end = sfi_mrtc_array[0].irq;
-
- return platform_device_register(&vrtc_device);
-}
-device_initcall(intel_mid_device_create);
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
deleted file mode 100644
index 30bd5714a3d4..000000000000
--- a/arch/x86/platform/intel-mid/sfi.c
+++ /dev/null
@@ -1,543 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * intel_mid_sfi.c: Intel MID SFI initialization code
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/scatterlist.h>
-#include <linux/sfi.h>
-#include <linux/spi/spi.h>
-#include <linux/i2c.h>
-#include <linux/skbuff.h>
-#include <linux/gpio.h>
-#include <linux/gpio_keys.h>
-#include <linux/input.h>
-#include <linux/platform_device.h>
-#include <linux/irq.h>
-#include <linux/export.h>
-#include <linux/notifier.h>
-#include <linux/mmc/core.h>
-#include <linux/mmc/card.h>
-#include <linux/blkdev.h>
-
-#include <asm/setup.h>
-#include <asm/mpspec_def.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/io_apic.h>
-#include <asm/intel-mid.h>
-#include <asm/intel_mid_vrtc.h>
-#include <asm/io.h>
-#include <asm/i8259.h>
-#include <asm/intel_scu_ipc.h>
-#include <asm/apb_timer.h>
-#include <asm/reboot.h>
-
-#define SFI_SIG_OEM0 "OEM0"
-#define MAX_IPCDEVS 24
-#define MAX_SCU_SPI 24
-#define MAX_SCU_I2C 24
-
-static struct platform_device *ipc_devs[MAX_IPCDEVS];
-static struct spi_board_info *spi_devs[MAX_SCU_SPI];
-static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
-static struct sfi_gpio_table_entry *gpio_table;
-static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
-static int ipc_next_dev;
-static int spi_next_dev;
-static int i2c_next_dev;
-static int i2c_bus[MAX_SCU_I2C];
-static int gpio_num_entry;
-static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
-int sfi_mrtc_num;
-int sfi_mtimer_num;
-
-struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
-EXPORT_SYMBOL_GPL(sfi_mrtc_array);
-
-struct blocking_notifier_head intel_scu_notifier =
- BLOCKING_NOTIFIER_INIT(intel_scu_notifier);
-EXPORT_SYMBOL_GPL(intel_scu_notifier);
-
-#define intel_mid_sfi_get_pdata(dev, priv) \
- ((dev)->get_platform_data ? (dev)->get_platform_data(priv) : NULL)
-
-/* parse all the mtimer info to a static mtimer array */
-int __init sfi_parse_mtmr(struct sfi_table_header *table)
-{
- struct sfi_table_simple *sb;
- struct sfi_timer_table_entry *pentry;
- struct mpc_intsrc mp_irq;
- int totallen;
-
- sb = (struct sfi_table_simple *)table;
- if (!sfi_mtimer_num) {
- sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
- struct sfi_timer_table_entry);
- pentry = (struct sfi_timer_table_entry *) sb->pentry;
- totallen = sfi_mtimer_num * sizeof(*pentry);
- memcpy(sfi_mtimer_array, pentry, totallen);
- }
-
- pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num);
- pentry = sfi_mtimer_array;
- for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
- pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n",
- totallen, (u32)pentry->phys_addr,
- pentry->freq_hz, pentry->irq);
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = MP_IRQTRIG_EDGE | MP_IRQPOL_ACTIVE_HIGH;
- mp_irq.srcbus = MP_BUS_ISA;
- mp_irq.srcbusirq = pentry->irq; /* IRQ */
- mp_irq.dstapic = MP_APIC_ALL;
- mp_irq.dstirq = pentry->irq;
- mp_save_irq(&mp_irq);
- mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL);
- }
-
- return 0;
-}
-
-struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
-{
- int i;
- if (hint < sfi_mtimer_num) {
- if (!sfi_mtimer_usage[hint]) {
- pr_debug("hint taken for timer %d irq %d\n",
- hint, sfi_mtimer_array[hint].irq);
- sfi_mtimer_usage[hint] = 1;
- return &sfi_mtimer_array[hint];
- }
- }
- /* take the first timer available */
- for (i = 0; i < sfi_mtimer_num;) {
- if (!sfi_mtimer_usage[i]) {
- sfi_mtimer_usage[i] = 1;
- return &sfi_mtimer_array[i];
- }
- i++;
- }
- return NULL;
-}
-
-void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
-{
- int i;
- for (i = 0; i < sfi_mtimer_num;) {
- if (mtmr->irq == sfi_mtimer_array[i].irq) {
- sfi_mtimer_usage[i] = 0;
- return;
- }
- i++;
- }
-}
-
-/* parse all the mrtc info to a global mrtc array */
-int __init sfi_parse_mrtc(struct sfi_table_header *table)
-{
- struct sfi_table_simple *sb;
- struct sfi_rtc_table_entry *pentry;
- struct mpc_intsrc mp_irq;
-
- int totallen;
-
- sb = (struct sfi_table_simple *)table;
- if (!sfi_mrtc_num) {
- sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
- struct sfi_rtc_table_entry);
- pentry = (struct sfi_rtc_table_entry *)sb->pentry;
- totallen = sfi_mrtc_num * sizeof(*pentry);
- memcpy(sfi_mrtc_array, pentry, totallen);
- }
-
- pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num);
- pentry = sfi_mrtc_array;
- for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
- pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
- totallen, (u32)pentry->phys_addr, pentry->irq);
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = MP_IRQTRIG_LEVEL | MP_IRQPOL_ACTIVE_LOW;
- mp_irq.srcbus = MP_BUS_ISA;
- mp_irq.srcbusirq = pentry->irq; /* IRQ */
- mp_irq.dstapic = MP_APIC_ALL;
- mp_irq.dstirq = pentry->irq;
- mp_save_irq(&mp_irq);
- mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL);
- }
- return 0;
-}
-
-
-/*
- * Parsing GPIO table first, since the DEVS table will need this table
- * to map the pin name to the actual pin.
- */
-static int __init sfi_parse_gpio(struct sfi_table_header *table)
-{
- struct sfi_table_simple *sb;
- struct sfi_gpio_table_entry *pentry;
- int num, i;
-
- if (gpio_table)
- return 0;
- sb = (struct sfi_table_simple *)table;
- num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
- pentry = (struct sfi_gpio_table_entry *)sb->pentry;
-
- gpio_table = kmemdup(pentry, num * sizeof(*pentry), GFP_KERNEL);
- if (!gpio_table)
- return -1;
- gpio_num_entry = num;
-
- pr_debug("GPIO pin info:\n");
- for (i = 0; i < num; i++, pentry++)
- pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
- " pin = %d\n", i,
- pentry->controller_name,
- pentry->pin_name,
- pentry->pin_no);
- return 0;
-}
-
-int get_gpio_by_name(const char *name)
-{
- struct sfi_gpio_table_entry *pentry = gpio_table;
- int i;
-
- if (!pentry)
- return -1;
- for (i = 0; i < gpio_num_entry; i++, pentry++) {
- if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
- return pentry->pin_no;
- }
- return -EINVAL;
-}
-
-static void __init intel_scu_ipc_device_register(struct platform_device *pdev)
-{
- if (ipc_next_dev == MAX_IPCDEVS)
- pr_err("too many SCU IPC devices");
- else
- ipc_devs[ipc_next_dev++] = pdev;
-}
-
-static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
-{
- struct spi_board_info *new_dev;
-
- if (spi_next_dev == MAX_SCU_SPI) {
- pr_err("too many SCU SPI devices");
- return;
- }
-
- new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
- if (!new_dev) {
- pr_err("failed to alloc mem for delayed spi dev %s\n",
- sdev->modalias);
- return;
- }
- *new_dev = *sdev;
-
- spi_devs[spi_next_dev++] = new_dev;
-}
-
-static void __init intel_scu_i2c_device_register(int bus,
- struct i2c_board_info *idev)
-{
- struct i2c_board_info *new_dev;
-
- if (i2c_next_dev == MAX_SCU_I2C) {
- pr_err("too many SCU I2C devices");
- return;
- }
-
- new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
- if (!new_dev) {
- pr_err("failed to alloc mem for delayed i2c dev %s\n",
- idev->type);
- return;
- }
- *new_dev = *idev;
-
- i2c_bus[i2c_next_dev] = bus;
- i2c_devs[i2c_next_dev++] = new_dev;
-}
-
-/* Called by IPC driver */
-void intel_scu_devices_create(void)
-{
- int i;
-
- for (i = 0; i < ipc_next_dev; i++)
- platform_device_add(ipc_devs[i]);
-
- for (i = 0; i < spi_next_dev; i++)
- spi_register_board_info(spi_devs[i], 1);
-
- for (i = 0; i < i2c_next_dev; i++) {
- struct i2c_adapter *adapter;
- struct i2c_client *client;
-
- adapter = i2c_get_adapter(i2c_bus[i]);
- if (adapter) {
- client = i2c_new_client_device(adapter, i2c_devs[i]);
- if (IS_ERR(client))
- pr_err("can't create i2c device %s\n",
- i2c_devs[i]->type);
- } else
- i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
- }
- intel_scu_notifier_post(SCU_AVAILABLE, NULL);
-}
-EXPORT_SYMBOL_GPL(intel_scu_devices_create);
-
-/* Called by IPC driver */
-void intel_scu_devices_destroy(void)
-{
- int i;
-
- intel_scu_notifier_post(SCU_DOWN, NULL);
-
- for (i = 0; i < ipc_next_dev; i++)
- platform_device_del(ipc_devs[i]);
-}
-EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
-
-static void __init install_irq_resource(struct platform_device *pdev, int irq)
-{
- /* Single threaded */
- static struct resource res __initdata = {
- .name = "IRQ",
- .flags = IORESOURCE_IRQ,
- };
- res.start = irq;
- platform_device_add_resources(pdev, &res, 1);
-}
-
-static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *pentry,
- struct devs_id *dev)
-{
- struct platform_device *pdev;
- void *pdata = NULL;
-
- pr_debug("IPC bus, name = %16.16s, irq = 0x%2x\n",
- pentry->name, pentry->irq);
-
- /*
- * We need to call platform init of IPC devices to fill misc_pdata
- * structure. It will be used in msic_init for initialization.
- */
- pdata = intel_mid_sfi_get_pdata(dev, pentry);
- if (IS_ERR(pdata))
- return;
-
- /*
- * On Medfield the platform device creation is handled by the MSIC
- * MFD driver so we don't need to do it here.
- */
- if (dev->msic && intel_mid_has_msic())
- return;
-
- pdev = platform_device_alloc(pentry->name, 0);
- if (pdev == NULL) {
- pr_err("out of memory for SFI platform device '%s'.\n",
- pentry->name);
- return;
- }
- install_irq_resource(pdev, pentry->irq);
-
- pdev->dev.platform_data = pdata;
- if (dev->delay)
- intel_scu_ipc_device_register(pdev);
- else
- platform_device_add(pdev);
-}
-
-static void __init sfi_handle_spi_dev(struct sfi_device_table_entry *pentry,
- struct devs_id *dev)
-{
- struct spi_board_info spi_info;
- void *pdata = NULL;
-
- memset(&spi_info, 0, sizeof(spi_info));
- strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
- spi_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq);
- spi_info.bus_num = pentry->host_num;
- spi_info.chip_select = pentry->addr;
- spi_info.max_speed_hz = pentry->max_freq;
- pr_debug("SPI bus=%d, name=%16.16s, irq=0x%2x, max_freq=%d, cs=%d\n",
- spi_info.bus_num,
- spi_info.modalias,
- spi_info.irq,
- spi_info.max_speed_hz,
- spi_info.chip_select);
-
- pdata = intel_mid_sfi_get_pdata(dev, &spi_info);
- if (IS_ERR(pdata))
- return;
-
- spi_info.platform_data = pdata;
- if (dev->delay)
- intel_scu_spi_device_register(&spi_info);
- else
- spi_register_board_info(&spi_info, 1);
-}
-
-static void __init sfi_handle_i2c_dev(struct sfi_device_table_entry *pentry,
- struct devs_id *dev)
-{
- struct i2c_board_info i2c_info;
- void *pdata = NULL;
-
- memset(&i2c_info, 0, sizeof(i2c_info));
- strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
- i2c_info.irq = ((pentry->irq == (u8)0xff) ? 0 : pentry->irq);
- i2c_info.addr = pentry->addr;
- pr_debug("I2C bus = %d, name = %16.16s, irq = 0x%2x, addr = 0x%x\n",
- pentry->host_num,
- i2c_info.type,
- i2c_info.irq,
- i2c_info.addr);
- pdata = intel_mid_sfi_get_pdata(dev, &i2c_info);
- i2c_info.platform_data = pdata;
- if (IS_ERR(pdata))
- return;
-
- if (dev->delay)
- intel_scu_i2c_device_register(pentry->host_num, &i2c_info);
- else
- i2c_register_board_info(pentry->host_num, &i2c_info, 1);
-}
-
-static void __init sfi_handle_sd_dev(struct sfi_device_table_entry *pentry,
- struct devs_id *dev)
-{
- struct mid_sd_board_info sd_info;
- void *pdata;
-
- memset(&sd_info, 0, sizeof(sd_info));
- strncpy(sd_info.name, pentry->name, SFI_NAME_LEN);
- sd_info.bus_num = pentry->host_num;
- sd_info.max_clk = pentry->max_freq;
- sd_info.addr = pentry->addr;
- pr_debug("SD bus = %d, name = %16.16s, max_clk = %d, addr = 0x%x\n",
- sd_info.bus_num,
- sd_info.name,
- sd_info.max_clk,
- sd_info.addr);
- pdata = intel_mid_sfi_get_pdata(dev, &sd_info);
- if (IS_ERR(pdata))
- return;
-
- /* Nothing we can do with this for now */
- sd_info.platform_data = pdata;
-
- pr_debug("Successfully registered %16.16s", sd_info.name);
-}
-
-extern struct devs_id *const __x86_intel_mid_dev_start[],
- *const __x86_intel_mid_dev_end[];
-
-static struct devs_id __init *get_device_id(u8 type, char *name)
-{
- struct devs_id *const *dev_table;
-
- for (dev_table = __x86_intel_mid_dev_start;
- dev_table < __x86_intel_mid_dev_end; dev_table++) {
- struct devs_id *dev = *dev_table;
- if (dev->type == type &&
- !strncmp(dev->name, name, SFI_NAME_LEN)) {
- return dev;
- }
- }
-
- return NULL;
-}
-
-static int __init sfi_parse_devs(struct sfi_table_header *table)
-{
- struct sfi_table_simple *sb;
- struct sfi_device_table_entry *pentry;
- struct devs_id *dev = NULL;
- int num, i, ret;
- int polarity;
- struct irq_alloc_info info;
-
- sb = (struct sfi_table_simple *)table;
- num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
- pentry = (struct sfi_device_table_entry *)sb->pentry;
-
- for (i = 0; i < num; i++, pentry++) {
- int irq = pentry->irq;
-
- if (irq != (u8)0xff) { /* native RTE case */
- /* these SPI2 devices are not exposed to system as PCI
- * devices, but they have separate RTE entry in IOAPIC
- * so we have to enable them one by one here
- */
- if (intel_mid_identify_cpu() ==
- INTEL_MID_CPU_CHIP_TANGIER) {
- if (!strncmp(pentry->name, "r69001-ts-i2c", 13))
- /* active low */
- polarity = 1;
- else if (!strncmp(pentry->name,
- "synaptics_3202", 14))
- /* active low */
- polarity = 1;
- else if (irq == 41)
- /* fast_int_1 */
- polarity = 1;
- else
- /* active high */
- polarity = 0;
- } else {
- /* PNW and CLV go with active low */
- polarity = 1;
- }
-
- ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity);
- ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info);
- WARN_ON(ret < 0);
- }
-
- dev = get_device_id(pentry->type, pentry->name);
-
- if (!dev)
- continue;
-
- switch (pentry->type) {
- case SFI_DEV_TYPE_IPC:
- sfi_handle_ipc_dev(pentry, dev);
- break;
- case SFI_DEV_TYPE_SPI:
- sfi_handle_spi_dev(pentry, dev);
- break;
- case SFI_DEV_TYPE_I2C:
- sfi_handle_i2c_dev(pentry, dev);
- break;
- case SFI_DEV_TYPE_SD:
- sfi_handle_sd_dev(pentry, dev);
- break;
- case SFI_DEV_TYPE_UART:
- case SFI_DEV_TYPE_HSI:
- default:
- break;
- }
- }
- return 0;
-}
-
-static int __init intel_mid_platform_init(void)
-{
- sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
- sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
- return 0;
-}
-arch_initcall(intel_mid_platform_init);
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index 1ac8578258af..b42bfdab01a9 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -27,7 +27,6 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
-MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
static bool force;
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 43b4d864817e..d2ccadc247e6 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -16,6 +16,7 @@
#include <asm/boot.h>
#include <asm/processor-flags.h>
#include <asm/msr.h>
+#include <asm/nospec-branch.h>
#include <xen/interface/elfnote.h>
__HEAD
@@ -105,6 +106,7 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
/* startup_64 expects boot_params in %rsi. */
mov $_pa(pvh_bootparams), %rsi
mov $_pa(startup_64), %rax
+ ANNOTATE_RETPOLINE_SAFE
jmp *%rax
#else /* CONFIG_X86_64 */
diff --git a/arch/x86/platform/sfi/Makefile b/arch/x86/platform/sfi/Makefile
deleted file mode 100644
index 4eba24c2af67..000000000000
--- a/arch/x86/platform/sfi/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_SFI) += sfi.o
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
deleted file mode 100644
index 6259563760f9..000000000000
--- a/arch/x86/platform/sfi/sfi.c
+++ /dev/null
@@ -1,100 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sfi.c - x86 architecture SFI support.
- *
- * Copyright (c) 2009, Intel Corporation.
- */
-
-#define KMSG_COMPONENT "SFI"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/sfi.h>
-#include <linux/io.h>
-
-#include <asm/irqdomain.h>
-#include <asm/io_apic.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
-
-/* All CPUs enumerated by SFI must be present and enabled */
-static void __init mp_sfi_register_lapic(u8 id)
-{
- if (MAX_LOCAL_APIC - id <= 0) {
- pr_warn("Processor #%d invalid (max %d)\n", id, MAX_LOCAL_APIC);
- return;
- }
-
- pr_info("registering lapic[%d]\n", id);
-
- generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
-}
-
-static int __init sfi_parse_cpus(struct sfi_table_header *table)
-{
- struct sfi_table_simple *sb;
- struct sfi_cpu_table_entry *pentry;
- int i;
- int cpu_num;
-
- sb = (struct sfi_table_simple *)table;
- cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
- pentry = (struct sfi_cpu_table_entry *)sb->pentry;
-
- for (i = 0; i < cpu_num; i++) {
- mp_sfi_register_lapic(pentry->apic_id);
- pentry++;
- }
-
- smp_found_config = 1;
- return 0;
-}
-#endif /* CONFIG_X86_LOCAL_APIC */
-
-#ifdef CONFIG_X86_IO_APIC
-
-static int __init sfi_parse_ioapic(struct sfi_table_header *table)
-{
- struct sfi_table_simple *sb;
- struct sfi_apic_table_entry *pentry;
- int i, num;
- struct ioapic_domain_cfg cfg = {
- .type = IOAPIC_DOMAIN_STRICT,
- .ops = &mp_ioapic_irqdomain_ops,
- };
-
- sb = (struct sfi_table_simple *)table;
- num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
- pentry = (struct sfi_apic_table_entry *)sb->pentry;
-
- for (i = 0; i < num; i++) {
- mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg);
- pentry++;
- }
-
- WARN(pic_mode, KERN_WARNING
- "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
- pic_mode = 0;
- return 0;
-}
-#endif /* CONFIG_X86_IO_APIC */
-
-/*
- * sfi_platform_init(): register lapics & io-apics
- */
-int __init sfi_platform_init(void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- register_lapic_address(sfi_lapic_addr);
- sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
-#endif
-#ifdef CONFIG_X86_IO_APIC
- sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
-#endif
- return 0;
-}
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile
index 6907b523e856..379777572bc9 100644
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -1,9 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
-OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y
# __restore_processor_state() restores %gs after S3 resume and so should not
# itself be stack-protected
CFLAGS_cpu.o := -fno-stack-protector
+# Clang may incorrectly inline functions with stack protector enabled into
+# __restore_processor_state(): https://bugs.llvm.org/show_bug.cgi?id=47479
+CFLAGS_REMOVE_cpu.o := $(CC_FLAGS_LTO)
+
obj-$(CONFIG_PM_SLEEP) += cpu.o
obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o hibernate.o
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index 7918b8415f13..d9bed596d849 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -21,6 +21,53 @@
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
#include <asm/frame.h>
+#include <asm/nospec-branch.h>
+
+ /* code below belongs to the image kernel */
+ .align PAGE_SIZE
+SYM_FUNC_START(restore_registers)
+ /* go back to the original page tables */
+ movq %r9, %cr3
+
+ /* Flush TLB, including "global" things (vmalloc) */
+ movq mmu_cr4_features(%rip), %rax
+ movq %rax, %rdx
+ andq $~(X86_CR4_PGE), %rdx
+ movq %rdx, %cr4; # turn off PGE
+ movq %cr3, %rcx; # flush TLB
+ movq %rcx, %cr3
+ movq %rax, %cr4; # turn PGE back on
+
+ /* We don't restore %rax, it must be 0 anyway */
+ movq $saved_context, %rax
+ movq pt_regs_sp(%rax), %rsp
+ movq pt_regs_bp(%rax), %rbp
+ movq pt_regs_si(%rax), %rsi
+ movq pt_regs_di(%rax), %rdi
+ movq pt_regs_bx(%rax), %rbx
+ movq pt_regs_cx(%rax), %rcx
+ movq pt_regs_dx(%rax), %rdx
+ movq pt_regs_r8(%rax), %r8
+ movq pt_regs_r9(%rax), %r9
+ movq pt_regs_r10(%rax), %r10
+ movq pt_regs_r11(%rax), %r11
+ movq pt_regs_r12(%rax), %r12
+ movq pt_regs_r13(%rax), %r13
+ movq pt_regs_r14(%rax), %r14
+ movq pt_regs_r15(%rax), %r15
+ pushq pt_regs_flags(%rax)
+ popfq
+
+ /* Saved in save_processor_state. */
+ lgdt saved_context_gdt_desc(%rax)
+
+ xorl %eax, %eax
+
+ /* tell the hibernation core that we've just restored the memory */
+ movq %rax, in_suspend(%rip)
+
+ ret
+SYM_FUNC_END(restore_registers)
SYM_FUNC_START(swsusp_arch_suspend)
movq $saved_context, %rax
@@ -52,7 +99,7 @@ SYM_FUNC_START(swsusp_arch_suspend)
ret
SYM_FUNC_END(swsusp_arch_suspend)
-SYM_CODE_START(restore_image)
+SYM_FUNC_START(restore_image)
/* prepare to jump to the image kernel */
movq restore_jump_address(%rip), %r8
movq restore_cr3(%rip), %r9
@@ -66,11 +113,12 @@ SYM_CODE_START(restore_image)
/* jump to relocated restore code */
movq relocated_restore_code(%rip), %rcx
+ ANNOTATE_RETPOLINE_SAFE
jmpq *%rcx
-SYM_CODE_END(restore_image)
+SYM_FUNC_END(restore_image)
/* code below has been relocated to a safe page */
-SYM_CODE_START(core_restore_code)
+SYM_FUNC_START(core_restore_code)
/* switch to temporary page tables */
movq %rax, %cr3
/* flush TLB */
@@ -97,51 +145,6 @@ SYM_CODE_START(core_restore_code)
.Ldone:
/* jump to the restore_registers address from the image header */
+ ANNOTATE_RETPOLINE_SAFE
jmpq *%r8
-SYM_CODE_END(core_restore_code)
-
- /* code below belongs to the image kernel */
- .align PAGE_SIZE
-SYM_FUNC_START(restore_registers)
- /* go back to the original page tables */
- movq %r9, %cr3
-
- /* Flush TLB, including "global" things (vmalloc) */
- movq mmu_cr4_features(%rip), %rax
- movq %rax, %rdx
- andq $~(X86_CR4_PGE), %rdx
- movq %rdx, %cr4; # turn off PGE
- movq %cr3, %rcx; # flush TLB
- movq %rcx, %cr3
- movq %rax, %cr4; # turn PGE back on
-
- /* We don't restore %rax, it must be 0 anyway */
- movq $saved_context, %rax
- movq pt_regs_sp(%rax), %rsp
- movq pt_regs_bp(%rax), %rbp
- movq pt_regs_si(%rax), %rsi
- movq pt_regs_di(%rax), %rdi
- movq pt_regs_bx(%rax), %rbx
- movq pt_regs_cx(%rax), %rcx
- movq pt_regs_dx(%rax), %rdx
- movq pt_regs_r8(%rax), %r8
- movq pt_regs_r9(%rax), %r9
- movq pt_regs_r10(%rax), %r10
- movq pt_regs_r11(%rax), %r11
- movq pt_regs_r12(%rax), %r12
- movq pt_regs_r13(%rax), %r13
- movq pt_regs_r14(%rax), %r14
- movq pt_regs_r15(%rax), %r15
- pushq pt_regs_flags(%rax)
- popfq
-
- /* Saved in save_processor_state. */
- lgdt saved_context_gdt_desc(%rax)
-
- xorl %eax, %eax
-
- /* tell the hibernation core that we've just restored the memory */
- movq %rax, in_suspend(%rip)
-
- ret
-SYM_FUNC_END(restore_registers)
+SYM_FUNC_END(core_restore_code)
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index 55b1ab378974..bddfc9a46645 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -29,14 +29,14 @@ posttest: $(obj)/insn_decoder_test vmlinux $(obj)/insn_sanity
hostprogs += insn_decoder_test insn_sanity
# -I needed for generated C source and C source which in the kernel tree.
-HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/
+HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(srctree)/tools/arch/x86/lib/ -I$(srctree)/tools/arch/x86/include/ -I$(objtree)/arch/x86/lib/
-HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+HOSTCFLAGS_insn_sanity.o := -Wall -I$(srctree)/tools/arch/x86/lib/ -I$(srctree)/tools/arch/x86/include/ -I$(objtree)/arch/x86/lib/
# Dependencies are also needed.
-$(obj)/insn_decoder_test.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+$(obj)/insn_decoder_test.o: $(srctree)/tools/arch/x86/lib/insn.c $(srctree)/tools/arch/x86/lib/inat.c $(srctree)/tools/arch/x86/include/asm/inat_types.h $(srctree)/tools/arch/x86/include/asm/inat.h $(srctree)/tools/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
-$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+$(obj)/insn_sanity.o: $(srctree)/tools/arch/x86/lib/insn.c $(srctree)/tools/arch/x86/lib/inat.c $(srctree)/tools/arch/x86/include/asm/inat_types.h $(srctree)/tools/arch/x86/include/asm/inat.h $(srctree)/tools/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
hostprogs += relocs
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
index 185ceba9d289..c6a0000ae635 100644
--- a/arch/x86/tools/insn_sanity.c
+++ b/arch/x86/tools/insn_sanity.c
@@ -14,10 +14,6 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
-
-#define unlikely(cond) (cond)
-#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
-
#include <asm/insn.h>
#include <inat.c>
#include <insn.c>
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index ce7188cbdae5..04c5a44b9682 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -61,8 +61,8 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
"(__iommu_table|__apicdrivers|__smp_locks)(|_end)|"
"__(start|end)_pci_.*|"
"__(start|end)_builtin_fw|"
- "__(start|stop)___ksymtab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
- "__(start|stop)___kcrctab(|_gpl|_unused|_unused_gpl|_gpl_future)|"
+ "__(start|stop)___ksymtab(|_gpl)|"
+ "__(start|stop)___kcrctab(|_gpl)|"
"__(start|stop)___param|"
"__(start|stop)___modver|"
"__(start|stop)___bug_table|"
@@ -867,9 +867,11 @@ static int do_reloc32(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
case R_386_PC32:
case R_386_PC16:
case R_386_PC8:
+ case R_386_PLT32:
/*
- * NONE can be ignored and PC relative relocations don't
- * need to be adjusted.
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
*/
break;
@@ -910,9 +912,11 @@ static int do_reloc_real(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
case R_386_PC32:
case R_386_PC16:
case R_386_PC8:
+ case R_386_PLT32:
/*
- * NONE can be ignored and PC relative relocations don't
- * need to be adjusted.
+ * NONE can be ignored and PC relative relocations don't need
+ * to be adjusted. Because sym must be defined, R_386_PLT32 can
+ * be treated the same way as R_386_PC32.
*/
break;
diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c
index e62174638f00..1dc9adc20b1c 100644
--- a/arch/x86/um/os-Linux/task_size.c
+++ b/arch/x86/um/os-Linux/task_size.c
@@ -145,7 +145,7 @@ out:
unsigned long os_get_top_address(void)
{
/* The old value of CONFIG_TOP_ADDR */
- return 0x7fc0000000;
+ return 0x7fc0002000;
}
#endif
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index 51fd256c75f0..c3891c1ada26 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -7,8 +7,8 @@
#define __SYSDEP_STUB_H
#include <asm/ptrace.h>
+#include <generated/asm-offsets.h>
-#define STUB_SYSCALL_RET EAX
#define STUB_MMAP_NR __NR_mmap2
#define MMAP_OFFSET(o) ((o) >> UM_KERN_PAGE_SHIFT)
@@ -77,17 +77,28 @@ static inline void trap_myself(void)
__asm("int3");
}
-static inline void remap_stack(int fd, unsigned long offset)
+static void inline remap_stack_and_trap(void)
{
- __asm__ volatile ("movl %%eax,%%ebp ; movl %0,%%eax ; int $0x80 ;"
- "movl %7, %%ebx ; movl %%eax, (%%ebx)"
- : : "g" (STUB_MMAP_NR), "b" (STUB_DATA),
- "c" (UM_KERN_PAGE_SIZE),
- "d" (PROT_READ | PROT_WRITE),
- "S" (MAP_FIXED | MAP_SHARED), "D" (fd),
- "a" (offset),
- "i" (&((struct stub_data *) STUB_DATA)->err)
- : "memory");
+ __asm__ volatile (
+ "movl %%esp,%%ebx ;"
+ "andl %0,%%ebx ;"
+ "movl %1,%%eax ;"
+ "movl %%ebx,%%edi ; addl %2,%%edi ; movl (%%edi),%%edi ;"
+ "movl %%ebx,%%ebp ; addl %3,%%ebp ; movl (%%ebp),%%ebp ;"
+ "int $0x80 ;"
+ "addl %4,%%ebx ; movl %%eax, (%%ebx) ;"
+ "int $3"
+ : :
+ "g" (~(UM_KERN_PAGE_SIZE - 1)),
+ "g" (STUB_MMAP_NR),
+ "g" (UML_STUB_FIELD_FD),
+ "g" (UML_STUB_FIELD_OFFSET),
+ "g" (UML_STUB_FIELD_CHILD_ERR),
+ "c" (UM_KERN_PAGE_SIZE),
+ "d" (PROT_READ | PROT_WRITE),
+ "S" (MAP_FIXED | MAP_SHARED)
+ :
+ "memory");
}
#endif
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
index 994df93c5ed3..6e2626b77a2e 100644
--- a/arch/x86/um/shared/sysdep/stub_64.h
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -7,8 +7,8 @@
#define __SYSDEP_STUB_H
#include <sysdep/ptrace_user.h>
+#include <generated/asm-offsets.h>
-#define STUB_SYSCALL_RET PT_INDEX(RAX)
#define STUB_MMAP_NR __NR_mmap
#define MMAP_OFFSET(o) (o)
@@ -82,18 +82,30 @@ static inline void trap_myself(void)
__asm("int3");
}
-static inline void remap_stack(long fd, unsigned long offset)
+static inline void remap_stack_and_trap(void)
{
- __asm__ volatile ("movq %4,%%r10 ; movq %5,%%r8 ; "
- "movq %6, %%r9; " __syscall "; movq %7, %%rbx ; "
- "movq %%rax, (%%rbx)":
- : "a" (STUB_MMAP_NR), "D" (STUB_DATA),
- "S" (UM_KERN_PAGE_SIZE),
- "d" (PROT_READ | PROT_WRITE),
- "g" (MAP_FIXED | MAP_SHARED), "g" (fd),
- "g" (offset),
- "i" (&((struct stub_data *) STUB_DATA)->err)
- : __syscall_clobber, "r10", "r8", "r9" );
+ __asm__ volatile (
+ "movq %0,%%rax ;"
+ "movq %%rsp,%%rdi ;"
+ "andq %1,%%rdi ;"
+ "movq %2,%%r10 ;"
+ "movq %%rdi,%%r8 ; addq %3,%%r8 ; movq (%%r8),%%r8 ;"
+ "movq %%rdi,%%r9 ; addq %4,%%r9 ; movq (%%r9),%%r9 ;"
+ __syscall ";"
+ "movq %%rsp,%%rdi ; andq %1,%%rdi ;"
+ "addq %5,%%rdi ; movq %%rax, (%%rdi) ;"
+ "int3"
+ : :
+ "g" (STUB_MMAP_NR),
+ "g" (~(UM_KERN_PAGE_SIZE - 1)),
+ "g" (MAP_FIXED | MAP_SHARED),
+ "g" (UML_STUB_FIELD_FD),
+ "g" (UML_STUB_FIELD_OFFSET),
+ "g" (UML_STUB_FIELD_CHILD_ERR),
+ "S" (UM_KERN_PAGE_SIZE),
+ "d" (PROT_READ | PROT_WRITE)
+ :
+ __syscall_clobber, "r10", "r8", "r9");
}
#endif
diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S
index a193e88536a9..8291899e6aaf 100644
--- a/arch/x86/um/stub_32.S
+++ b/arch/x86/um/stub_32.S
@@ -5,21 +5,22 @@
.globl batch_syscall_stub
batch_syscall_stub:
- /* load pointer to first operation */
- mov $(STUB_DATA+8), %esp
-
+ /* %esp comes in as "top of page" */
+ mov %esp, %ecx
+ /* %esp has pointer to first operation */
+ add $8, %esp
again:
/* load length of additional data */
mov 0x0(%esp), %eax
/* if(length == 0) : end of list */
/* write possible 0 to header */
- mov %eax, STUB_DATA+4
+ mov %eax, 0x4(%ecx)
cmpl $0, %eax
jz done
/* save current pointer */
- mov %esp, STUB_DATA+4
+ mov %esp, 0x4(%ecx)
/* skip additional data */
add %eax, %esp
@@ -38,6 +39,10 @@ again:
/* execute syscall */
int $0x80
+ /* restore top of page pointer in %ecx */
+ mov %esp, %ecx
+ andl $(~UM_KERN_PAGE_SIZE) + 1, %ecx
+
/* check return value */
pop %ebx
cmp %ebx, %eax
@@ -45,7 +50,7 @@ again:
done:
/* save return value */
- mov %eax, STUB_DATA
+ mov %eax, (%ecx)
/* stop */
int3
diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S
index 8a95c5b2eaf9..f3404640197a 100644
--- a/arch/x86/um/stub_64.S
+++ b/arch/x86/um/stub_64.S
@@ -4,9 +4,8 @@
.section .__syscall_stub, "ax"
.globl batch_syscall_stub
batch_syscall_stub:
- mov $(STUB_DATA), %rbx
- /* load pointer to first operation */
- mov %rbx, %rsp
+ /* %rsp has the pointer to first operation */
+ mov %rsp, %rbx
add $0x10, %rsp
again:
/* load length of additional data */
diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c
index 27361cbb7ca9..21836eaf1725 100644
--- a/arch/x86/um/stub_segv.c
+++ b/arch/x86/um/stub_segv.c
@@ -11,10 +11,11 @@
void __attribute__ ((__section__ (".__syscall_stub")))
stub_segv_handler(int sig, siginfo_t *info, void *p)
{
+ int stack;
ucontext_t *uc = p;
+ struct faultinfo *f = (void *)(((unsigned long)&stack) & ~(UM_KERN_PAGE_SIZE - 1));
- GET_FAULTINFO_FROM_MC(*((struct faultinfo *) STUB_DATA),
- &uc->uc_mcontext);
+ GET_FAULTINFO_FROM_MC(*f, &uc->uc_mcontext);
trap_myself();
}
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index fc5c5ba4aacb..40b5779fce21 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-OBJECT_FILES_NON_STANDARD_xen-asm.o := y
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 9a5a50cdaab5..dc0a337f985b 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -567,10 +567,16 @@ void noist_exc_debug(struct pt_regs *regs);
DEFINE_IDTENTRY_RAW(xenpv_exc_nmi)
{
- /* On Xen PV, NMI doesn't use IST. The C part is the sane as native. */
+ /* On Xen PV, NMI doesn't use IST. The C part is the same as native. */
exc_nmi(regs);
}
+DEFINE_IDTENTRY_RAW_ERRORCODE(xenpv_exc_double_fault)
+{
+ /* On Xen PV, DF doesn't use IST. The C part is the same as native. */
+ exc_double_fault(regs, error_code);
+}
+
DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
{
/*
@@ -590,6 +596,20 @@ DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap)
BUG();
}
+#ifdef CONFIG_X86_MCE
+DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
+{
+ /*
+ * There's no IST on Xen PV, but we still need to dispatch
+ * to the correct handler.
+ */
+ if (user_mode(regs))
+ noist_exc_machine_check(regs);
+ else
+ exc_machine_check(regs);
+}
+#endif
+
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
@@ -608,9 +628,9 @@ struct trap_array_entry {
static struct trap_array_entry trap_array[] = {
TRAP_ENTRY_REDIR(exc_debug, true ),
- TRAP_ENTRY(exc_double_fault, true ),
+ TRAP_ENTRY_REDIR(exc_double_fault, true ),
#ifdef CONFIG_X86_MCE
- TRAP_ENTRY(exc_machine_check, true ),
+ TRAP_ENTRY_REDIR(exc_machine_check, true ),
#endif
TRAP_ENTRY_REDIR(exc_nmi, true ),
TRAP_ENTRY(exc_int3, false ),
@@ -1015,8 +1035,6 @@ void __init xen_setup_vcpu_info_placement(void)
*/
if (xen_have_vcpu_info_placement) {
pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
- pv_ops.irq.restore_fl =
- __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_ops.irq.irq_disable =
__PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
pv_ops.irq.irq_enable =
@@ -1053,7 +1071,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_pmc = xen_read_pmc,
.iret = xen_iret,
- .usergs_sysret64 = xen_sysret64,
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
@@ -1078,9 +1095,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
#endif
.io_delay = xen_io_delay,
- /* Xen takes care of %gs when switching to usermode for us */
- .swapgs = paravirt_nop,
-
.start_context_switch = paravirt_start_context_switch,
.end_context_switch = xen_end_context_switch,
};
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 850c93f346c7..dfa091d79c2e 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -42,28 +42,6 @@ asmlinkage __visible unsigned long xen_save_fl(void)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
-__visible void xen_restore_fl(unsigned long flags)
-{
- struct vcpu_info *vcpu;
-
- /* convert from IF type flag */
- flags = !(flags & X86_EFLAGS_IF);
-
- /* See xen_irq_enable() for why preemption must be disabled. */
- preempt_disable();
- vcpu = this_cpu_read(xen_vcpu);
- vcpu->evtchn_upcall_mask = flags;
-
- if (flags == 0) {
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- xen_force_evtchn_callback();
- preempt_enable();
- } else
- preempt_enable_no_resched();
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
-
asmlinkage __visible void xen_irq_disable(void)
{
/* There's a one instruction preempt window here. We need to
@@ -118,7 +96,6 @@ static void xen_halt(void)
static const struct pv_irq_ops xen_irq_ops __initconst = {
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
- .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 3301875dd196..17d80f751fcb 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -416,6 +416,9 @@ void __init xen_vmalloc_p2m_tree(void)
xen_p2m_last_pfn = xen_max_p2m_pfn;
p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE;
+ if (!p2m_limit && IS_ENABLED(CONFIG_XEN_UNPOPULATED_ALLOC))
+ p2m_limit = xen_start_info->nr_pages * XEN_EXTRA_MEM_RATIO;
+
vm.flags = VM_ALLOC;
vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit),
PMD_SIZE * PMDS_PER_MID_PAGE);
@@ -652,10 +655,9 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
pte_t *ptep;
unsigned int level;
- if (unlikely(pfn >= xen_p2m_size)) {
- BUG_ON(mfn != INVALID_P2M_ENTRY);
- return true;
- }
+ /* Only invalid entries allowed above the highest p2m covered frame. */
+ if (unlikely(pfn >= xen_p2m_size))
+ return mfn == INVALID_P2M_ENTRY;
/*
* The interface requires atomic updates on p2m elements.
@@ -710,9 +712,12 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
for (i = 0; i < count; i++) {
unsigned long mfn, pfn;
+ struct gnttab_unmap_grant_ref unmap[2];
+ int rc;
/* Do not add to override if the map failed. */
- if (map_ops[i].status)
+ if (map_ops[i].status != GNTST_okay ||
+ (kmap_ops && kmap_ops[i].status != GNTST_okay))
continue;
if (map_ops[i].flags & GNTMAP_contains_pte) {
@@ -726,16 +731,51 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned");
- if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
- ret = -ENOMEM;
- goto out;
+ if (likely(set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
+ continue;
+
+ /*
+ * Signal an error for this slot. This in turn requires
+ * immediate unmapping.
+ */
+ map_ops[i].status = GNTST_general_error;
+ unmap[0].host_addr = map_ops[i].host_addr,
+ unmap[0].handle = map_ops[i].handle;
+ map_ops[i].handle = INVALID_GRANT_HANDLE;
+ if (map_ops[i].flags & GNTMAP_device_map)
+ unmap[0].dev_bus_addr = map_ops[i].dev_bus_addr;
+ else
+ unmap[0].dev_bus_addr = 0;
+
+ if (kmap_ops) {
+ kmap_ops[i].status = GNTST_general_error;
+ unmap[1].host_addr = kmap_ops[i].host_addr,
+ unmap[1].handle = kmap_ops[i].handle;
+ kmap_ops[i].handle = INVALID_GRANT_HANDLE;
+ if (kmap_ops[i].flags & GNTMAP_device_map)
+ unmap[1].dev_bus_addr = kmap_ops[i].dev_bus_addr;
+ else
+ unmap[1].dev_bus_addr = 0;
}
+
+ /*
+ * Pre-populate both status fields, to be recognizable in
+ * the log message below.
+ */
+ unmap[0].status = 1;
+ unmap[1].status = 1;
+
+ rc = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ unmap, 1 + !!kmap_ops);
+ if (rc || unmap[0].status != GNTST_okay ||
+ unmap[1].status != GNTST_okay)
+ pr_err_once("gnttab unmap failed: rc=%d st0=%d st1=%d\n",
+ rc, unmap[0].status, unmap[1].status);
}
out:
return ret;
}
-EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
struct gnttab_unmap_grant_ref *kunmap_ops,
@@ -750,20 +790,17 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
unsigned long pfn = page_to_pfn(pages[i]);
- if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
+ if (mfn != INVALID_P2M_ENTRY && (mfn & FOREIGN_FRAME_BIT))
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ else
ret = -EINVAL;
- goto out;
- }
-
- set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
if (kunmap_ops)
ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
- kunmap_ops, count);
-out:
+ kunmap_ops, count) ?: ret;
+
return ret;
}
-EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
#ifdef CONFIG_XEN_DEBUG_FS
#include <linux/debugfs.h>
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 7eab14d56369..1a3b75652fa4 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -59,18 +59,6 @@ static struct {
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
-/*
- * The maximum amount of extra memory compared to the base size. The
- * main scaling factor is the size of struct page. At extreme ratios
- * of base:extra, all the base memory can be filled with page
- * structures for the extra memory, leaving no space for anything
- * else.
- *
- * 10x seems like a reasonable balance between scaling flexibility and
- * leaving a practically usable system.
- */
-#define EXTRA_MEM_RATIO (10)
-
static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
static void __init xen_parse_512gb(void)
@@ -790,20 +778,13 @@ char * __init xen_memory_setup(void)
extra_pages += max_pages - max_pfn;
/*
- * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
- * factor the base size. On non-highmem systems, the base
- * size is the full initial memory allocation; on highmem it
- * is limited to the max size of lowmem, so that it doesn't
- * get completely filled.
+ * Clamp the amount of extra memory to a XEN_EXTRA_MEM_RATIO
+ * factor the base size.
*
* Make sure we have no memory above max_pages, as this area
* isn't handled by the p2m management.
- *
- * In principle there could be a problem in lowmem systems if
- * the initial memory is also very large with respect to
- * lowmem, but we won't try to deal with that here.
*/
- extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+ extra_pages = min3(XEN_EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
extra_pages, max_pages - max_pfn);
i = 0;
addr = xen_e820_table.entries[0].addr;
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 53cf8aa35032..1e626444712b 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -14,6 +14,7 @@
#include <asm/thread_info.h>
#include <asm/asm.h>
#include <asm/frame.h>
+#include <asm/unwind_hints.h>
#include <xen/interface/xen.h>
@@ -72,34 +73,6 @@ SYM_FUNC_START(xen_save_fl_direct)
ret
SYM_FUNC_END(xen_save_fl_direct)
-
-/*
- * In principle the caller should be passing us a value return from
- * xen_save_fl_direct, but for robustness sake we test only the
- * X86_EFLAGS_IF flag rather than the whole byte. After setting the
- * interrupt mask state, it checks for unmasked pending events and
- * enters the hypervisor to get them delivered if so.
- */
-SYM_FUNC_START(xen_restore_fl_direct)
- FRAME_BEGIN
- testw $X86_EFLAGS_IF, %di
- setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
- /*
- * Preempt here doesn't matter because that will deal with any
- * pending interrupts. The pending check may end up being run
- * on the wrong CPU, but that doesn't hurt.
- */
-
- /* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
- jnz 1f
- call check_events
-1:
- FRAME_END
- ret
-SYM_FUNC_END(xen_restore_fl_direct)
-
-
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
@@ -146,6 +119,7 @@ SYM_FUNC_END(xen_read_cr2_direct);
.macro xen_pv_trap name
SYM_CODE_START(xen_\name)
+ UNWIND_HINT_EMPTY
pop %rcx
pop %r11
jmp \name
@@ -161,7 +135,7 @@ xen_pv_trap asm_exc_overflow
xen_pv_trap asm_exc_bounds
xen_pv_trap asm_exc_invalid_op
xen_pv_trap asm_exc_device_not_available
-xen_pv_trap asm_exc_double_fault
+xen_pv_trap asm_xenpv_exc_double_fault
xen_pv_trap asm_exc_coproc_segment_overrun
xen_pv_trap asm_exc_invalid_tss
xen_pv_trap asm_exc_segment_not_present
@@ -172,7 +146,7 @@ xen_pv_trap asm_exc_spurious_interrupt_bug
xen_pv_trap asm_exc_coprocessor_error
xen_pv_trap asm_exc_alignment_check
#ifdef CONFIG_X86_MCE
-xen_pv_trap asm_exc_machine_check
+xen_pv_trap asm_xenpv_exc_machine_check
#endif /* CONFIG_X86_MCE */
xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
@@ -185,6 +159,7 @@ xen_pv_trap asm_exc_xen_hypervisor_callback
SYM_CODE_START(xen_early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
+ UNWIND_HINT_EMPTY
pop %rcx
pop %r11
jmp early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE
@@ -211,30 +186,11 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
* rsp->rax }
*/
SYM_CODE_START(xen_iret)
+ UNWIND_HINT_EMPTY
pushq $0
jmp hypercall_iret
SYM_CODE_END(xen_iret)
-SYM_CODE_START(xen_sysret64)
- /*
- * We're already on the usermode stack at this point, but
- * still with the kernel gs, so we can easily switch back.
- *
- * tss.sp2 is scratch space.
- */
- movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-
- pushq $__USER_DS
- pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
- pushq %r11
- pushq $__USER_CS
- pushq %rcx
-
- pushq $VGCF_in_syscall
- jmp hypercall_iret
-SYM_CODE_END(xen_sysret64)
-
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
@@ -251,7 +207,8 @@ SYM_CODE_END(xen_sysret64)
*/
/* Normal 64-bit system call target */
-SYM_FUNC_START(xen_syscall_target)
+SYM_CODE_START(xen_syscall_target)
+ UNWIND_HINT_EMPTY
popq %rcx
popq %r11
@@ -264,12 +221,13 @@ SYM_FUNC_START(xen_syscall_target)
movq $__USER_CS, 1*8(%rsp)
jmp entry_SYSCALL_64_after_hwframe
-SYM_FUNC_END(xen_syscall_target)
+SYM_CODE_END(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
-SYM_FUNC_START(xen_syscall32_target)
+SYM_CODE_START(xen_syscall32_target)
+ UNWIND_HINT_EMPTY
popq %rcx
popq %r11
@@ -282,10 +240,11 @@ SYM_FUNC_START(xen_syscall32_target)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSCALL_compat_after_hwframe
-SYM_FUNC_END(xen_syscall32_target)
+SYM_CODE_END(xen_syscall32_target)
/* 32-bit compat sysenter target */
-SYM_FUNC_START(xen_sysenter_target)
+SYM_CODE_START(xen_sysenter_target)
+ UNWIND_HINT_EMPTY
/*
* NB: Xen is polite and clears TF from EFLAGS for us. This means
* that we don't need to guard against single step exceptions here.
@@ -302,17 +261,18 @@ SYM_FUNC_START(xen_sysenter_target)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSENTER_compat_after_hwframe
-SYM_FUNC_END(xen_sysenter_target)
+SYM_CODE_END(xen_sysenter_target)
#else /* !CONFIG_IA32_EMULATION */
-SYM_FUNC_START_ALIAS(xen_syscall32_target)
-SYM_FUNC_START(xen_sysenter_target)
+SYM_CODE_START(xen_syscall32_target)
+SYM_CODE_START(xen_sysenter_target)
+ UNWIND_HINT_EMPTY
lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
pushq $0
jmp hypercall_iret
-SYM_FUNC_END(xen_sysenter_target)
-SYM_FUNC_END_ALIAS(xen_syscall32_target)
+SYM_CODE_END(xen_sysenter_target)
+SYM_CODE_END(xen_syscall32_target)
#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 2d7c8f34f56c..cb6538ae2fe0 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -68,8 +68,9 @@ SYM_CODE_END(asm_cpu_bringup_and_idle)
.balign PAGE_SIZE
SYM_CODE_START(hypercall_page)
.rept (PAGE_SIZE / 32)
- UNWIND_HINT_EMPTY
- .skip 32
+ UNWIND_HINT_FUNC
+ .skip 31, 0x90
+ ret
.endr
#define HYPERCALL(n) \
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9546c3384c75..8d7ec49a35fb 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -131,15 +131,12 @@ static inline void __init xen_efi_init(struct boot_params *boot_params)
__visible void xen_irq_enable_direct(void);
__visible void xen_irq_disable_direct(void);
__visible unsigned long xen_save_fl_direct(void);
-__visible void xen_restore_fl_direct(unsigned long);
__visible unsigned long xen_read_cr2(void);
__visible unsigned long xen_read_cr2_direct(void);
/* These are not functions, and cannot be called normally */
__visible void xen_iret(void);
-__visible void xen_sysret32(void);
-__visible void xen_sysret64(void);
extern int xen_panic_handler_init(void);